def CopyTo(self, output):
    """Stream the source document through this SAX handler into *output*.

    Resets the per-run debug flags, remembers the output sink on the
    instance, then parses the file returned by ``self._GetFile()`` with
    this object installed as the content handler.
    """
    # Fresh pass: clear the debug bookkeeping from any previous run.
    self._debug_in_way = False
    self._debug_in_relation = False
    self.log("starting nodes")
    self._output = output
    reader = make_parser()
    reader.setContentHandler(self)
    reader.parse(self._GetFile())
def parse(self):
    """Parse the XCRI feed and normalise each presentation record.

    Runs ``self.xcri_file`` through the SAX handler, then transforms every
    raw presentation dict (lists of values from the parser) into flat,
    Solr-ready fields. Records that cannot be normalised are logged and
    skipped; the usable ones are appended to ``self.presentations``.
    """
    parser = sax.make_parser()
    parser.setContentHandler(self.handler)
    parser.setFeature(sax.handler.feature_namespaces, 1)
    parser.parse(self.xcri_file)

    # transformations
    for p in self.handler.presentations:
        try:
            p['provider_title'] = p['provider_title'][0]
            p['course_title'] = p['course_title'][0]
            p['course_identifier'] = self._get_identifier(p['course_identifier'])
            p['course_description'] = ''.join(p['course_description'])
            presentation_id = self._get_identifier(p['presentation_identifier'])
            if not presentation_id:
                # Presentation identifier is the main ID for a document;
                # if there is no ID, we do not want to import it.
                raise Exception("Presentation with no ID")
            p['presentation_identifier'] = presentation_id

            # Date fields: unwrap the first value and convert to Solr format.
            # (Collapsed from eight copy-pasted `if field in p:` blocks.)
            for field in ('presentation_start', 'presentation_end',
                          'presentation_applyFrom', 'presentation_applyUntil'):
                if field in p:
                    p[field] = self._date_to_solr_format(p[field][0])

            # Plain single-valued fields: keep only the first value.
            for field in ('presentation_bookingEndpoint',
                          'presentation_memberApplyTo',
                          'presentation_attendanceMode',
                          'presentation_attendancePattern'):
                if field in p:
                    p[field] = p[field][0]

            if 'presentation_venue_identifier' in p:
                # we're only interested by OxPoints ID atm
                oxpoints = self._get_identifier(
                    p['presentation_venue_identifier'],
                    uri_base="http://oxpoints.oucs.ox.ac.uk/id/")
                if oxpoints:
                    p['presentation_venue_identifier'] = 'oxpoints:{id}'.format(id=oxpoints)
                else:
                    del p['presentation_venue_identifier']

            p['course_subject'] = [subject for subject in p['course_subject']
                                   if subject not in self.ignore_subjects]
            self.presentations.append(p)
        except Exception:
            # Best-effort import: a bad record must not stop the whole feed.
            logger.warning("Couldn't transform presentation",
                           exc_info=True, extra={'presentation': p})
def parse_file(input_file, output_file):
    """Run *input_file* through a ``TemplatesAndModulesHandler``.

    The handler is constructed with both file names and receives every SAX
    event; namespace processing is disabled so element names arrive
    unqualified.
    """
    handler = TemplatesAndModulesHandler(input_file, output_file)
    reader = xml.sax.make_parser()
    # Turn off namespace processing.
    reader.setFeature(xml.sax.handler.feature_namespaces, 0)
    reader.setContentHandler(handler)
    reader.parse(input_file)
def handle(self, *args, **options):
    """Import/refresh Tree rows from the GML source named in a YAML config.

    Reads the config file given by ``options['config']``, extracts the
    collection timestamp from the GML file with a SAX handler, then walks
    every feature in the data source: existing trees (matched by exact
    location) are updated, new ones are created. With ``options['dry']``
    nothing is saved; each tree is printed instead.
    """
    with open(options['config']) as f:
        # yaml.load() without an explicit Loader is deprecated (PyYAML >= 5.1)
        # and can execute arbitrary constructors; the config is plain data,
        # so safe_load is sufficient and safe.
        config = yaml.safe_load(f.read())

    # get the timeStamp using a SAX parser
    handler = TimeStampHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    with open(config['file_name']) as f:
        parser.parse(f)
    time_stamp = handler.time_stamp

    data_source = DataSource(config['file_name'])
    counter = Counter()
    for feature in tqdm(data_source[0]):
        # Build the point geometry from the feature (ETRS89 / UTM 33N).
        point = GEOSGeometry(str(feature.geom), srid=25833)
        try:
            tree = Tree.objects.get(location=point)
            counter['updated'] += 1
        except Tree.DoesNotExist:
            tree = Tree(location=point, created=time_stamp)
            counter['created'] += 1
        # Copy the configured attributes. The original had two identical
        # loops over disjoint field lists; one merged loop behaves the same.
        for attr in ('identifier', 'species', 'genus', 'borough',
                     'year', 'age', 'circumference', 'height'):
            key = config['fields'].get(attr)
            if key:
                setattr(tree, attr, feature[key].value)
        tree.feature_name = config['feature_name']
        tree.updated = time_stamp
        if options['dry']:
            print(tree, tree.properties)
        else:
            tree.save()
    print(counter)
def make_parser(handler):
    """Build a SAX parser wired to *handler*, namespaces on, validation off.

    Document validation is deliberately disabled: validating may require
    fetching the LIGO LW DTD from the LDAS document server whenever the
    DTD is not inlined in the XML, which needs a working network connection
    and an available server. External general entities are disabled too.
    """
    p = sax.make_parser()
    p.setFeature(sax.handler.feature_namespaces, True)
    p.setFeature(sax.handler.feature_validation, False)
    p.setFeature(sax.handler.feature_external_ges, False)
    p.setContentHandler(handler)
    return p
def get_timestamp(filename):
    """Return the ``timeStamp`` of a WFS GML file, localized to the site zone.

    Pulls the ``timeStamp`` attribute off the ``wfs:FeatureCollection`` root
    element, parses it as a naive datetime, and attaches the configured
    ``settings.TIME_ZONE``.
    """

    class _TimeStampGrabber(xml.sax.ContentHandler):
        # Set once the feature-collection root element is seen.
        timestamp = None

        def startElement(self, name, attrs):
            if name == "wfs:FeatureCollection":
                self.timestamp = attrs['timeStamp']

    grabber = _TimeStampGrabber()
    reader = xml.sax.make_parser()
    reader.setContentHandler(grabber)
    reader.parse(filename)
    # default_timezone=None keeps the parsed datetime naive so that
    # localize() below can attach the project time zone.
    naive = iso8601.parse_date(grabber.timestamp, default_timezone=None)
    return pytz.timezone(settings.TIME_ZONE).localize(naive)
def parse_revisions(filename):
    """Incrementally parse a gzipped XML revision dump.

    Feeds the file to the SAX parser line by line and yields each document
    the handler completes as soon as it is available, so the (potentially
    huge) dump is never held in memory at once.
    """
    # The default expat reader already implements the incremental
    # feed()/close() interface. The original passed the bogus module name
    # "xml.sax.IncrementalParser" (a class, not a parser module), which
    # make_parser silently skipped via ImportError before falling back to
    # the same default reader — so dropping it changes nothing.
    parser = xml.sax.make_parser()

    ready = deque()
    handler = DocumentHandler()
    # The handler calls back with each finished document; queue them so the
    # generator can hand them out between feed() calls.
    handler.callback = ready.append
    parser.setContentHandler(handler)

    with gzip.GzipFile(filename, "r") as raw_f:
        # NOTE(review): EncodedFile with a single "utf8" argument re-encodes
        # utf8 -> utf8 (a no-op wrapper); kept for behavioral fidelity.
        f = codecs.EncodedFile(raw_f, "utf8")
        for line in f:
            parser.feed(line)
            while ready:
                yield ready.popleft()
    # Flush whatever the final close() event produces.
    parser.close()
    while ready:
        yield ready.popleft()
def CopyTo(self, output):
    """Parse the backing file through this handler, directing results to *output*.

    Stores the sink on the instance, then runs ``self._GetFile()`` through a
    fresh SAX parser with this object as the content handler.
    """
    self._output = output
    reader = make_parser()
    reader.setContentHandler(self)
    reader.parse(self._GetFile())
def upload_handler():
    """Flask view: accept an uploaded dump.zip, verify its GOST signature
    with openssl, SAX-parse the contained dump.xml, hash it, and register
    the archive in the MySQL `dumps` table.

    Returns a small HTML "OK" page on success; aborts with HTTP 500 on any
    validation or processing failure.
    """
    addr = get_ip(request)
    acheck = get_auth(addr, request, ACCESS_UPLOAD)
    user = acheck['nick']
    # check if the post request has the file part
    if 'file' not in request.files:
        app.logger.warning('%s (%s): No file part', addr, user)
        abort(500)
    file = request.files['file']
    # if user does not select file, browser also
    # submit a empty part without filename
    if file.filename == '':
        app.logger.warning('%s (%s): No selected file', addr, user)
        abort(500)
    if file and allowed_file(file.filename):
        H = RegHandler()
        # hd hashes dump.xml with the _rb/_re-delimited envelope stripped;
        # hda hashes the raw bytes (its digest becomes the unique file ID).
        hd = hashlib.sha256()
        hda = hashlib.sha256()
        mtime = ""
        size = 0
        uniqname = ""
        # Per-upload scratch directory under UPLOAD_FOLDER.
        tempdir = os.path.join(app.config['UPLOAD_FOLDER'], str(uuid.uuid4()))
        os.mkdir(tempdir)
        # New filename harcoded - dump.zip
        filename = os.path.join(tempdir, 'dump.zip')
        file.save(filename)
        asize = os.path.getsize(filename)
        # Check zip archive
        if zipfile.is_zipfile(filename):
            with zipfile.ZipFile(filename) as myzip:
                try:
                    # check for dump.xml and dump.xml.sig presents
                    # (getinfo raises KeyError for a missing member, which
                    # the bare except below converts into a 500)
                    info1 = myzip.getinfo('dump.xml')
                    mtime = "%4d-%02d-%02dT%02d:%02d:%02d+03:00" % info1.date_time  # fixed time zone
                    size = info1.file_size
                    info2 = myzip.getinfo('dump.xml.sig')  # presence check only; value unused
                    # extract dump.xml and dump.xml.sig
                    myxml = myzip.extract('dump.xml', tempdir)
                    mysig = myzip.extract('dump.xml.sig', tempdir)
                    # check signature
                    # NOTE(review): this local shadows the stdlib `tempfile` module
                    tempfile = os.path.join(tempdir, 'dump.xml.temp')
                    args = [
                        'openssl', 'smime', '-verify', '-engine', 'gost', '-in',
                        mysig, '-noverify', '-inform', 'DER', '-content', myxml,
                        '-out', tempfile
                    ]
                    if subprocess.call(args) == 0:
                        # Signature OK: parse dump.xml to populate H
                        # (its update times are used in the INSERT below).
                        parser = xml.sax.make_parser()
                        parser.setFeature(xml.sax.handler.feature_namespaces, 0)
                        parser.setContentHandler(H)
                        parser.parse(myxml)
                    else:
                        # NOTE(review): no exception is active here, so
                        # sys.exc_info()[1] logs None rather than a cause.
                        app.logger.error("%s (%s): OpenSSL execute error %s",
                                         addr, user, sys.exc_info()[1])
                        abort(500)
                    # get hash sha256
                    with open(myxml, 'rb') as fh:
                        # Chunked state machine over BLOCK_SIZE reads:
                        #   fl == 0: accumulate until _rb matches, hash with it removed
                        #   fl == 1: hash trailing blocks until _re appears, hash with it removed
                        #   fl == 2: envelope handled; only hda keeps updating
                        s = b''
                        p = b''
                        fl = 0
                        for block in iter(lambda: fh.read(BLOCK_SIZE), b''):
                            hda.update(block)
                            if fl == 0:
                                s += block
                                if _rb.match(s):
                                    hd.update(_rb.sub(b"", s))
                                    fl = 1
                            elif fl == 1:
                                s = p + block
                                if _re.search(s):
                                    hd.update(_re.sub(b"", s))
                                    fl = 2
                                else:
                                    hd.update(p)
                                    p = block
                    # clear for temporary files
                    if os.path.exists(tempfile):
                        os.unlink(tempfile)
                    if os.path.exists(myxml):
                        os.unlink(myxml)
                    if os.path.exists(mysig):
                        os.unlink(mysig)
                except:
                    # NOTE(review): bare except also catches the HTTPException
                    # raised by abort(500) above, logging it a second time.
                    app.logger.error("%s (%s): Check error %s", addr, user, sys.exc_info()[1])
                    abort(500)
            uniqid = hda.hexdigest()
            realid = hd.hexdigest()
            uniqname = uniqid + '.zip'
            # Shard stored files by the first two byte-pairs of the hash.
            datadir = os.path.join(app.config['DATA_FOLDER'], uniqname[0:2],
                                   uniqname[2:4])
            if not os.path.exists(datadir):
                os.makedirs(datadir)
            newfilename = os.path.join(datadir, uniqname)
            # check batabase
            cur = mysql.connection.cursor()
            cur.execute('SELECT * FROM dumps WHERE id = %s', (uniqid, ))
            rv = cur.fetchall()
            # Rows are indexed by column name -- presumably a DictCursor is
            # configured; verify against the app's MySQL setup.
            dump = rv[0] if rv else None
            _mtime = dateutil.parser.parse(mtime).timestamp()
            if dump is not None:
                # Duplicate upload: a row with this hash already exists.
                app.logger.warning('%s (%s): Record %s already exists', addr, user, uniqname)
                if dump['a'] == 0:
                    # presumably `a` flags whether the archive file is
                    # already stored -- TODO confirm against the schema
                    add_file(addr, user, uniqname, filename, newfilename)
                app.logger.warning('%s (%s): Archive %s, skipping...', addr, user, uniqname)
                if _mtime != dump['m']:
                    app.logger.warning(
                        '%s (%s): Record %s has mtime: %s, but file has mtime: %s',
                        addr, user, uniqname, dump['m'], _mtime)
            else:
                add_file(addr, user, uniqname, filename, newfilename)
                cur.execute(
                    'INSERT INTO `dumps` (`id`, `crc`, `ut`, `utu`, `m`, `as`, `s`, `u`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                    (uniqid, realid, int(H.updateTime_ut), int(H.updateTimeUrgently_ut),
                     int(_mtime), size, asize, int(time.time())))
                mysql.connection.commit()
                app.logger.info("%s (%s): %s (%s) record was added", addr, user, uniqid, realid)
            if os.path.exists(filename):
                os.unlink(filename)
            if os.path.exists(tempdir):
                os.rmdir(tempdir)
        else:
            app.logger.error("%s (%s): Not zip file", addr, user)
            abort(500)
        return '''
<!doctype html>
<title>OK</title>
<h1>OK %s updateTime=%s updateTimeUrgently=%s mtime=%s</h1>
''' % (uniqname, int(H.updateTime_ut), int(H.updateTimeUrgently_ut), int(_mtime))
    else:
        app.logger.error("%s (%s): Bogus filename %s", addr, user, file.filename)
        abort(500)
    return '''
'''  # NOTE(review): trailing literal appears truncated in the source; closing quotes restored
def upload_handler():
    """Flask view: accept an uploaded dump.xml, validate it against an XSD
    with xmllint, SAX-parse it for the update time, hash it, and register
    the dump in the MySQL `dumps` table.

    Returns a small HTML "OK" page on success; aborts with HTTP 500 on any
    validation or processing failure.
    """
    addr = get_ip(request)
    acheck = get_auth(addr, request, ACCESS_UPLOAD)
    user = acheck['nick']
    # check if the post request has the file part
    if 'file' not in request.files:
        app.logger.warning('%s (%s): No file part', addr, user)
        abort(500)
    file = request.files['file']
    # if user does not select file, browser also
    # submit a empty part without filename
    if file.filename == '':
        app.logger.warning('%s (%s): No selected file', addr, user)
        abort(500)
    if file and allowed_file(file.filename):
        H = RegHandler()
        # hd hashes the file with the _rb/_re-delimited envelope stripped;
        # hda hashes the raw bytes (its digest becomes the unique file ID).
        hd = hashlib.sha256()
        hda = hashlib.sha256()
        # NOTE(review): `size` is never assigned after this, so the INSERT
        # below always stores 0 -- looks like a bug; confirm intent.
        size = 0
        uniqname = ""
        # Per-upload scratch directory under UPLOAD_FOLDER.
        tempdir = os.path.join(app.config['UPLOAD_FOLDER'], str(uuid.uuid4()))
        os.mkdir(tempdir)
        # New filename harcoded - dump.xml
        filename = os.path.join(tempdir, 'dump.xml')
        file.save(filename)
        # Check zip archive
        try:
            # SOME CHECK???!!!
            # Validate the upload against the XSD via the xmllint binary.
            args = [
                'xmllint', '--noout', '--schema', '/srv/dumpby/dump.xsd',
                filename
            ]
            rcode = subprocess.call(args)
            if rcode == 0:
                app.logger.info("%s (%s): check passed", addr, user)
            else:
                app.logger.error("%s (%s): xmllint execute error %d", addr, user, rcode)
                # NOTE(review): bare `raise` with no active exception raises
                # RuntimeError, which the except below treats as a failed check.
                raise
            # parse xml (get updatetime)
            parser = xml.sax.make_parser()
            parser.setFeature(xml.sax.handler.feature_namespaces, 0)
            parser.setContentHandler(H)
            parser.parse(filename)
            # get hash sha256
            with open(filename, 'rb') as fh:
                # Chunked state machine over BLOCK_SIZE reads:
                #   fl == 0: accumulate until _rb matches, hash with it removed
                #   fl == 1: hash trailing blocks until _re appears, hash with it removed
                #   fl == 2: envelope handled; only hda keeps updating
                s = b''
                p = b''
                fl = 0
                for block in iter(lambda: fh.read(BLOCK_SIZE), b''):
                    hda.update(block)
                    if fl == 0:
                        s += block
                        if _rb.search(s):
                            hd.update(_rb.sub(b"", s))
                            fl = 1
                    elif fl == 1:
                        s = p + block
                        if _re.search(s):
                            hd.update(_re.sub(b"", s))
                            fl = 2
                        else:
                            hd.update(p)
                            p = block
        except:
            # NOTE(review): bare except; cleans up the scratch dir then 500s.
            app.logger.error("%s (%s): Check error %s", addr, user, sys.exc_info()[1])
            if os.path.exists(filename):
                os.unlink(filename)
            if os.path.exists(tempdir):
                os.rmdir(tempdir)
            abort(500)
        uniqid = hda.hexdigest()
        realid = hd.hexdigest()
        uniqname = uniqid + '.xml'
        # Shard stored files by the first two byte-pairs of the hash.
        datadir = os.path.join(app.config['DATA_FOLDER'], uniqname[0:2],
                               uniqname[2:4])
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        newfilename = os.path.join(datadir, uniqname)
        # check batabase
        cur = mysql.connection.cursor()
        cur.execute('SELECT * FROM dumps WHERE id = %s', (uniqid, ))
        rv = cur.fetchall()
        # Rows are indexed by column name -- presumably a DictCursor is
        # configured; verify against the app's MySQL setup.
        dump = rv[0] if rv else None
        if dump is not None:
            # Duplicate upload: a row with this hash already exists.
            app.logger.warning('%s (%s): Record %s already exists', addr, user, uniqname)
            if dump['a'] == 0:
                # presumably `a` flags whether the file is already stored --
                # TODO confirm against the schema
                add_file(addr, user, uniqname, filename, newfilename)
            app.logger.warning('%s (%s): Archive %s, skipping...', addr, user, uniqname)
        else:
            add_file(addr, user, uniqname, filename, newfilename)
            cur.execute(
                'INSERT INTO `dumps` (`id`, `crc`, `ut`, `s`, `u`) VALUES (%s, %s, %s, %s, %s)',
                (uniqid, realid, int(H.updateTime_ut), size, int(time.time())))
            mysql.connection.commit()
            app.logger.info("%s (%s): %s (%s) record was added", addr, user, uniqid, realid)
        if os.path.exists(filename):
            os.unlink(filename)
        if os.path.exists(tempdir):
            os.rmdir(tempdir)
        return '''
<!doctype html>
<title>OK</title>
<h1>OK %s updateTime=%s</h1>
''' % (uniqname, int(H.updateTime_ut))
    else:
        app.logger.error("%s (%s): Bogus filename %s", addr, user, file.filename)
        abort(500)
    return '''
'''  # NOTE(review): trailing literal appears truncated in the source; closing quotes restored
# Call when an element ends
def endElement(self, tag):
    """SAX end-of-element hook: persist a completed, flagged node to the DB.

    When a ``node`` element closes and ``self.node.saveToDB == 1``, inserts
    the node's coordinates into ``osm_nodes`` and each of its tags into
    ``osm_tags``; then resets ``self.node``.
    """
    if tag == "node":
        if self.node and self.node.saveToDB == 1:
            # Node contains the tourism tag with the right values -> save it.
            try:
                cursor = myDB.executeQuery(
                    "INSERT INTO osm_nodes (lat, lon) VALUES (%s, %s)",
                    (self.node.lat, self.node.lon))
                nodeID = cursor.lastrowid
                # Loop variable renamed from `tag`: the original shadowed the
                # `tag` parameter of endElement.
                for node_tag in self.node.tags:
                    myDB.executeQuery(
                        "INSERT INTO osm_tags (node_id, k, v) VALUES (%s, %s, %s)",
                        (nodeID, node_tag.k, node_tag.v))
            except Exception as e:
                # Fixed: the original used the Python 2 print statement
                # (`print tag.k, tag.v, e`), a syntax error under Python 3,
                # and the shadowed `tag` crashed with AttributeError when the
                # first INSERT failed before the loop bound it.
                print("osm node insert failed:", e)
        self.node = ""


if (__name__ == "__main__"):
    # create an XMLReader
    parser = xml.sax.make_parser()
    # turn off namepsaces
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    # override the default ContextHandler
    Handler = XMLHandler()
    parser.setContentHandler(Handler)
    parser.parse("openStreetMapData.xml")