def meta_private_school_schemas(self):
    """Add one schema table per private-school CSV file in the CKAN package.

    For every resource in the source package the CSV file is downloaded,
    its first row is taken as the header, and every remaining row is passed
    to ``self.intuit_schema`` to accumulate per-column types and lengths.
    A table named ``private_schools_<year>`` is then added to the schema,
    where ``<year>`` is the first year of the ``YYYY-YYYY`` range found in
    the resource name. Each header cell becomes the description of one
    column, whose name comes from ``self.transform_field_name``.

    A failure while building one table is logged and that resource is
    skipped; the remaining resources are still processed.

    Raises:
        AttributeError: if a resource name contains no ``YYYY-YYYY`` range
            (``re.search`` returns ``None``).
    """
    from ambry.client.ckan import new_ckan
    import re
    import csv

    ckan = new_ckan(self.metadata.config.datarepo("default"))
    package = ckan.get_package(self.metadata.build.private_schools.source_package)

    # Schema datatype name for each Python type that may appear in the
    # ``types`` list accumulated by intuit_schema (looked up as types[i]).
    type_map = {int: "integer", float: "real", str: "varchar"}

    with self.session:
        if not self.database.exists():
            self.database.create()

    # NOTE(review): the per-table work below opens its own session, so the
    # loop runs outside the session used for database creation -- confirm.
    # For each file listed in the CKAN package ...
    for r in package["resources"]:
        self.log("Processing: {}".format(r["name"]))

        m = re.search(r"(\d{4})-(\d{4})", r["name"]).groups()
        year = int(m[0])

        path = self.filesystem.download(r["url"])
        self.log(" File: {}".format(path))

        # Read all of the rows and figure out the header, lengths and types.
        with open(path) as f:
            reader = csv.reader(f)
            # The first row holds the column descriptions. The builtin
            # next() works on Python 2.6+ and 3, unlike reader.next().
            header = next(reader)
            types = []
            lengths = []
            for row in reader:
                types, lengths = self.intuit_schema(row, types, lengths)

        # Computed before the try block so the abort message below can
        # always name the table, even if entering the session fails.
        table_name = "private_schools_" + str(year)

        # Now create the schema entries.
        try:
            with self.session:
                table = self.schema.add_table(table_name)
                table.add_column("id", datatype="integer", is_primary_key=True)

                for i, description in enumerate(header):
                    field = self.transform_field_name(i, description)
                    try:
                        table.add_column(
                            field,
                            datatype=type_map[types[i]],
                            width=int(lengths[i]),
                            description=description,
                        )
                    except Exception:
                        # Log which column failed, then let the outer
                        # handler abort this table.
                        self.error("Failed to add column {}, {}.{}".format(i, table_name, field))
                        self.error("Header: {}".format(header))
                        raise
        except Exception as e:
            self.error("Aborting load for table {}: {}".format(table_name, e))
            continue
def meta_private_school_schemas(self):
    """Add one schema table per private-school CSV file in the CKAN package.

    For every resource in the source package the CSV file is downloaded,
    its first row is taken as the header, and every remaining row is passed
    to ``self.intuit_schema`` to accumulate per-column types and lengths.
    A table named ``private_schools_<year>`` is then added to the schema,
    where ``<year>`` is the first year of the ``YYYY-YYYY`` range found in
    the resource name. Each header cell becomes the description of one
    column, whose name comes from ``self.transform_field_name``.

    A failure while building one table is logged and that resource is
    skipped; the remaining resources are still processed.

    Raises:
        AttributeError: if a resource name contains no ``YYYY-YYYY`` range
            (``re.search`` returns ``None``).
    """
    from ambry.client.ckan import new_ckan
    import re
    import csv

    ckan = new_ckan(self.metadata.config.datarepo('default'))
    package = ckan.get_package(self.metadata.build.private_schools.source_package)

    # Schema datatype name for each Python type that may appear in the
    # ``types`` list accumulated by intuit_schema (looked up as types[i]).
    type_map = {int: "integer", float: 'real', str: 'varchar'}

    with self.session:
        if not self.database.exists():
            self.database.create()

    # NOTE(review): the per-table work below opens its own session, so the
    # loop runs outside the session used for database creation -- confirm.
    # For each file listed in the CKAN package ...
    for r in package['resources']:
        self.log("Processing: {}".format(r['name']))

        m = re.search(r'(\d{4})-(\d{4})', r['name']).groups()
        year = int(m[0])

        path = self.filesystem.download(r['url'])
        self.log(" File: {}".format(path))

        # Read all of the rows and figure out the header, lengths and types.
        with open(path) as f:
            reader = csv.reader(f)
            # The first row holds the column descriptions. The builtin
            # next() works on Python 2.6+ and 3, unlike reader.next().
            header = next(reader)
            types = []
            lengths = []
            for row in reader:
                types, lengths = self.intuit_schema(row, types, lengths)

        # Computed before the try block so the abort message below can
        # always name the table, even if entering the session fails.
        table_name = 'private_schools_' + str(year)

        # Now create the schema entries.
        try:
            with self.session:
                table = self.schema.add_table(table_name)
                table.add_column('id', datatype='integer', is_primary_key=True)

                for i, description in enumerate(header):
                    field = self.transform_field_name(i, description)
                    try:
                        table.add_column(
                            field,
                            datatype=type_map[types[i]],
                            width=int(lengths[i]),
                            description=description,
                        )
                    except Exception:
                        # Log which column failed, then let the outer
                        # handler abort this table.
                        self.error("Failed to add column {}, {}.{}".format(i, table_name, field))
                        self.error("Header: {}".format(header))
                        raise
        except Exception as e:
            self.error("Aborting load for table {}: {}".format(table_name, e))
            continue
def meta_get_urls(self):
    """Save the CSV file URLs from the repository into ``meta/urls.yaml``.

    Recording the URLs means the builder of the package does not need an
    account on the repo. One entry is written per resource of the source
    package, holding the resource name, the first year of the
    ``YYYY-YYYY`` range found in that name, and the resource URL.
    """
    from ambry.client.ckan import new_ckan
    import re
    import yaml

    repo_config = self.metadata.config.datarepo("default")
    ckan = new_ckan(repo_config)
    source_package = self.metadata.build.private_schools.source_package
    package = ckan.get_package(source_package)

    def describe(resource):
        # The first captured group is the starting year of the range.
        year_range = re.search(r"(\d{4})-(\d{4})", resource["name"]).groups()
        start_year = int(year_range[0])
        return {
            "name": str(resource["name"]),
            "year": start_year,
            "url": str(resource["url"]),
        }

    urls = [describe(resource) for resource in package["resources"]]

    # Block style with a 4-space indent keeps the file easy to read.
    document = yaml.dump(urls, indent=4, default_flow_style=False)
    with open(self.filesystem.path("meta", "urls.yaml"), "w") as out:
        out.write(document)
def meta_get_urls(self):
    """Save the CSV file URLs from the repository into ``meta/urls.yaml``.

    Recording the URLs means the builder of the package does not need an
    account on the repo. One entry is written per resource of the source
    package, holding the resource name, the first year of the
    ``YYYY-YYYY`` range found in that name, and the resource URL.
    """
    from ambry.client.ckan import new_ckan
    import re
    import yaml

    repo_config = self.metadata.config.datarepo('default')
    ckan = new_ckan(repo_config)
    source_package = self.metadata.build.private_schools.source_package
    package = ckan.get_package(source_package)

    def describe(resource):
        # The first captured group is the starting year of the range.
        year_range = re.search(r'(\d{4})-(\d{4})', resource['name']).groups()
        start_year = int(year_range[0])
        return {
            'name': str(resource['name']),
            'year': start_year,
            'url': str(resource['url']),
        }

    urls = [describe(resource) for resource in package['resources']]

    # Block style with a 4-space indent keeps the file easy to read.
    document = yaml.dump(urls, indent=4, default_flow_style=False)
    with open(self.filesystem.path('meta', 'urls.yaml'), 'w') as out:
        out.write(document)