def query(region, pollutant, year):
    """Fetches the remote archive for the given region, pollutant and
    year. Returns the downloaded zip archive as a file-like object.
    """
    region_code = regions_dict.get_pk(region)
    region_name = regions_dict.get_name(region)
    pollutant_code = pollutants_dict.get_pk(pollutant)
    pollutant_formula = pollutants_dict.get_formula(pollutant)

    query = urllib.urlencode({
        'p_comp': pollutant_code,
        'p_comp_name': pollutant_formula.upper(),
        'p_reg': region_code,
        'p_reg_name': region_name.upper(),
        'p_anno': year,
    })

    genfile = "%(prefix)s/servlet/zipper?%(query)s" % {
        'prefix': URL_PREFIX,
        'query': query,
    }

    link = download(genfile)
    if link is None:
        raise IOError("Could not fetch '%s'." % genfile)

    # the servlet answers with a small page whose script tag holds the
    # location of the server-side generated archive
    soup = BeautifulSoup(link)
    location = \
        soup.find('script').contents[0].split('"')[1].\
        replace("../download/", "")
    link.close()

    archive_url = "%(prefix)s/download/%(location)s" % {
        'prefix': URL_PREFIX,
        'location': location,
    }
    archive = download(archive_url)
    if archive is None:
        raise IOError("Could not fetch '%s'." % archive_url)

    return archive
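
# A minimal usage sketch for query() (hypothetical lookup keys; assumes
# regions_dict and pollutants_dict resolve the same values accepted on the
# command line, and that download() returns a file-like object):
#
#     archive = query("lombardia", "pm10", 2009)
#     data = archive.read()  # raw bytes of the zip generated server-side
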
def __call__(self):
    if '.' not in self._out:
        self._out = self._out + ".zip"

    # disk cleanup
    if os.path.exists(self._tmpdir):
        shutil.rmtree(self._tmpdir, True)  # TODO add something for errors
    os.mkdir(self._tmpdir)

    azip = zipfile.ZipFile(self._out, "w")

    # write dspl xml output to file; entries are stored at the archive
    # root (zipfile would otherwise record the tmpdir prefix in the entry
    # name, which DSPL does not expect)
    fullpath = os.path.join(self._tmpdir, "brace.xml")
    xml = open(fullpath, "wt")
    xml.write(build_dspl_xml())
    xml.close()
    azip.write(fullpath, os.path.basename(fullpath))

    # write aggregates csv file
    fullpath = os.path.join(self._tmpdir, "aggregates.csv")
    aggrcsv = open(fullpath, "wt")
    aggrcsv.write("aggregate, description\n")
    for (id, desc) in [
            ("max", "Maximum daily concentration"),
            ("avg", "Average daily concentration"),
    ]:
        entry = u"%(id)s, %(description)s\n" % {
            'id': id,
            'description': desc,
        }
        aggrcsv.write(entry)
    aggrcsv.close()
    azip.write(fullpath, os.path.basename(fullpath))

    # write regions csv file
    fullpath = os.path.join(self._tmpdir, "regions.csv")
    regcsv = open(fullpath, "wt")
    regcsv.write("region, name, latitude, longitude\n")
    for r in opts_mgr.regions:
        entry = u"%(region)s, %(region)s, %(latitude)s, %(longitude)s\n" % {
            'region': regions_dict.get_name(r),
            'latitude': regions_dict.get_latitude(r),
            'longitude': regions_dict.get_longitude(r),
        }
        regcsv.write(entry)
    regcsv.close()
    azip.write(fullpath, os.path.basename(fullpath))

    # write stations csv file
    fullpath = os.path.join(self._tmpdir, "stations.csv")
    stscsv = open(fullpath, "wt")
    stscsv.write("station, name, region, latitude, longitude\n")
    for (regcode, name, latitude, longitude) in stations_dict.all():
        entry = u"%(station)s, %(station)s, %(region)s, %(latitude)s, %(longitude)s\n" % {
            'station': name,
            'region': regions_dict.get_name(regcode),
            'latitude': latitude,
            'longitude': longitude,
        }
        stscsv.write(entry)
    stscsv.close()
    azip.write(fullpath, os.path.basename(fullpath))

    # write pollutants csv file
    fullpath = os.path.join(self._tmpdir, "pollutants.csv")
    csv = open(fullpath, "wt")
    csv.write("pollutant, description\n")
    for (_, formula, description) in pollutants_dict.all():
        entry = u"%(formula)s, %(description)s\n" % {
            'formula': formula,
            'description': escape(description),
        }
        csv.write(entry)
    csv.close()
    azip.write(fullpath, os.path.basename(fullpath))

    # write measurements csv file for the slice table.
    # Remark: as the csv file *must* be sorted according to dimensions,
    # it is necessary to build two separate temp files and join them
    # together once every row has been processed. :-/
    data_fullpath = os.path.join(self._tmpdir, "data.csv")
    data_csv = open(data_fullpath, "wt")
    data_csv.write("region, station, aggregate, pollutant, day, measurement\n")

    max_file = tempfile.TemporaryFile()
    avg_file = tempfile.TemporaryFile()

    # generate aggregated data
    for (region, station, day, pollutant, max_, avg_) in self._yield():
        formula = pollutants_dict.get_formula(pollutant)

        entry = u"%(region)s, %(station)s, max, %(formula)s, %(day)s, %(qty).3f\n" % {
            'region': region,
            'station': station,
            'formula': formula,
            'day': time.strftime("%Y-%m-%d", day + (0,) * 6),
            'qty': max_,
        }
        max_file.write(entry)

        entry = u"%(region)s, %(station)s, avg, %(formula)s, %(day)s, %(qty).3f\n" % {
            'region': region,
            'station': station,
            'formula': formula,
            'day': time.strftime("%Y-%m-%d", day + (0,) * 6),
            'qty': avg_,
        }
        avg_file.write(entry)

    # concatenate max_file and avg_file
    max_file.seek(0)
    for l in max_file:
        data_csv.write(l)
    max_file.close()  # get rid of temp file

    avg_file.seek(0)
    for l in avg_file:
        data_csv.write(l)
    avg_file.close()  # get rid of temp file

    data_csv.close()
    azip.write(data_fullpath, os.path.basename(data_fullpath))

    # disk cleanup
    if os.path.exists(self._tmpdir):
        shutil.rmtree(self._tmpdir, True)  # TODO add something for errors
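
# For reference, a sketch of the resulting data.csv layout (values are
# illustrative only): rows are grouped by aggregate, all "max" rows first,
# then all "avg" rows, so the file is sorted along that dimension as the
# DSPL slice requires:
#
#     region, station, aggregate, pollutant, day, measurement
#     Lombardia, Milano, max, PM10, 2009-01-01, 81.000
#     Lombardia, Milano, max, PM10, 2009-01-02, 74.500
#     Lombardia, Milano, avg, PM10, 2009-01-01, 44.250
#     Lombardia, Milano, avg, PM10, 2009-01-02, 39.100
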
def __call__(self, args):
    opts, args = getopt.getopt(args, "", self.long_options)
    for o, a in opts:
        if o == "--year":
            year = int(a)
            if year < DEFAULT_FROM_YEAR:
                raise getopt.GetoptError(
                    "No data available before %d" % DEFAULT_FROM_YEAR)
            self.from_year = year
            logger.debug("Setting starting year to %d", year)
            self.to_year = year
            logger.debug("Setting ending year to %d", year)

        elif o == "--from":
            from_year = int(a)
            if from_year < DEFAULT_FROM_YEAR:
                raise getopt.GetoptError(
                    "No data available before %d" % DEFAULT_FROM_YEAR)
            self.from_year = from_year
            logger.debug("Setting starting year to %d", from_year)

        elif o == "--to":
            to_year = int(a)
            if DEFAULT_TO_YEAR < to_year:
                raise getopt.GetoptError(
                    "No data available after %d" % DEFAULT_TO_YEAR)
            self.to_year = to_year
            logger.debug("Setting ending year to %d", to_year)

        elif o == "--region":
            region_name = regions_dict.get_name(a)
            region_code = regions_dict.get_pk(a)
            self.regions.append(region_code)
            logger.debug("Adding region '%s'", region_name)

        elif o == '--pollutant':
            pollutant_formula = pollutants_dict.get_formula(a)
            pollutant_name = pollutants_dict.get_name(a)
            pollutant_code = pollutants_dict.get_pk(a)
            self.pollutants.append(pollutant_code)
            logger.debug("Adding pollutant '%s' (%s)",
                         pollutant_formula, pollutant_name)

        elif o == "--verbosity":
            level = int(a)
            self.verbosity = level
            if level == 0:
                logger.setLevel(logging.ERROR)
            elif level == 1:
                logger.setLevel(logging.WARNING)
            elif level == 2:
                logger.setLevel(logging.INFO)
            elif level == 3:
                logger.setLevel(logging.DEBUG)
            else:
                assert False, "Unsupported verbosity level"
            logger.debug("Setting verbosity level to %s",
                         ["ERROR", "WARNING", "INFO", "DEBUG"][level])

        elif o == "--keep":
            self.keep = True
            if self.local:
                raise getopt.GetoptError(
                    "--local and --keep are not supported together")

        elif o == "--local":
            self.local = True
            if self.keep:
                raise getopt.GetoptError(
                    "--local and --keep are not supported together")

        elif o == "--help":
            print usage
            sys.exit()

        else:
            assert False, "unhandled option"
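
# A minimal sketch of how the parser is driven (hypothetical argument
# values; assumes an OptionsManager instance named opts_mgr, as used by
# the main body below):
#
#     opts_mgr(["--region=lombardia", "--pollutant=pm10", "--year=2009"])
#     # opts_mgr.regions and opts_mgr.pollutants now hold the resolved codes
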
    %(pollutants)s

    --help, prints this message.

    --verbosity=<level>, adjusts the level of verbosity of the tool. This
    is a number between 0 (quiet) and 3 (extremely verbose). This is mainly
    for debugging purposes.

    --format=<format>, determines the output format for the dataset.
    Currently the tool supports only DSPL (Dataset Publishing Language).

arguments:

    filename, the filename to write the output to.
""" % {
    'regions': "\n    ".join(
        [regions_dict.get_name(r[0]) for r in regions_dict.all()]),
    'pollutants': "\n    ".join([
        "%(formula)s (%(name)s)" % {
            'formula': pollutants_dict.get_formula(p[0]),
            'name': pollutants_dict.get_name(p[0]),
        } for p in pollutants_dict.all()
    ]),
}


class OptionsManager(object):
    """Provides options management.
    """

    long_options = [
        "help",
        r[0] for r in regions_dict.all()]


# main body
if __name__ == "__main__":

    data_mgr = DataManager()

    # Phase 1. Fetch data
    total_rows = 0
    for pollutant_code in opts_mgr.pollutants:
        pollutant_formula = pollutants_dict.get_formula(pollutant_code)
        pollutant_name = pollutants_dict.get_name(pollutant_code)

        for region_code in opts_mgr.regions:
            region_name = regions_dict.get_name(region_code)

            for year in range(opts_mgr.from_year, 1 + opts_mgr.to_year):

                if not opts_mgr.local:
                    # fetch remote archive
                    logger.info(
                        "Trying to fetch data for year %d, "
                        "pollutant '%s' (%s), region '%s'",
                        year, pollutant_formula, pollutant_name, region_name)
                    archive = query(region_code, pollutant_code, year)

                else:
                    # use local archive
                    logger.info(