Esempio n. 1
0
    def meta_private_school_schemas(self):
        """Create one schema table per private-school CSV in the CKAN package.

        For every resource in the source package this method:

        1. parses the starting year out of the resource name (``YYYY-YYYY``),
        2. downloads the file and feeds every data row through
           ``self.intuit_schema`` to accumulate column types and lengths,
        3. adds a ``private_schools_<year>`` table to ``self.schema`` with an
           ``id`` primary key plus one column per header field.

        A resource that cannot be processed (no year range in its name, an
        empty file, or a failure while adding the table or its columns) is
        reported through ``self.error`` and skipped, so the remaining
        resources are still handled.
        """
        from ambry.client.ckan import new_ckan
        import re
        import csv

        ckan = new_ckan(self.metadata.config.datarepo("default"))
        package = ckan.get_package(self.metadata.build.private_schools.source_package)

        with self.session:
            if not self.database.exists():
                self.database.create()

        # Loop invariants, built once instead of once per resource.
        year_range = re.compile(r"(\d{4})-(\d{4})")
        type_map = {int: "integer", float: "real", str: "varchar"}

        # For each file listed in the CKAN package ...
        for r in package["resources"]:
            self.log("Processing: {}".format(r["name"]))

            # A resource without a year range cannot be mapped to a table;
            # skip it instead of crashing on a None match.
            match = year_range.search(r["name"])
            if match is None:
                self.error("Skipping resource {}: no year range in name".format(r["name"]))
                continue

            year = int(match.group(1))
            # Bound before the try block so the error handler below can
            # always reference it.
            table_name = "private_schools_" + str(year)

            path = self.filesystem.download(r["url"])

            self.log("    File: {}".format(path))

            # Read all of the rows and figure out the header, length and types
            with open(path) as f:
                reader = csv.reader(f)
                # The first row is the header; it is kept because its values
                # become the column descriptions. The builtin next() works on
                # both Python 2 and 3, and the default avoids StopIteration
                # on an empty file.
                header = next(reader, None)
                types = []
                lengths = []
                for row in reader:
                    types, lengths = self.intuit_schema(row, types, lengths)

            if header is None:
                self.error("Skipping resource {}: file {} is empty".format(r["name"], path))
                continue

            # Now create schema entries
            try:
                with self.session:
                    table = self.schema.add_table(table_name)
                    table.add_column("id", datatype="integer", is_primary_key=True)

                    for i, description in enumerate(header):

                        field = self.transform_field_name(i, description)

                        try:
                            table.add_column(
                                field,
                                datatype=type_map[types[i]],
                                width=int(lengths[i]),
                                description=description,
                            )
                        except Exception:
                            # Log which column failed, then let the outer
                            # handler abort this table.
                            self.error("Failed to add column {}, {}.{}".format(i, table_name, field))
                            self.error("Header: {}".format(header))
                            raise
            except Exception as e:
                self.error("Aborting load for table {}: {}".format(table_name, e))
Esempio n. 2
0
    def meta_private_school_schemas(self):
        """Create one schema table per private-school CSV in the CKAN package.

        For every resource in the source package this method:

        1. parses the starting year out of the resource name (``YYYY-YYYY``),
        2. downloads the file and feeds every data row through
           ``self.intuit_schema`` to accumulate column types and lengths,
        3. adds a ``private_schools_<year>`` table to ``self.schema`` with an
           ``id`` primary key plus one column per header field.

        A resource that cannot be processed (no year range in its name, an
        empty file, or a failure while adding the table or its columns) is
        reported through ``self.error`` and skipped, so the remaining
        resources are still handled.
        """
        from ambry.client.ckan import new_ckan
        import re
        import csv

        ckan = new_ckan(self.metadata.config.datarepo('default'))
        package = ckan.get_package(self.metadata.build.private_schools.source_package)

        with self.session:
            if not self.database.exists():
                self.database.create()

        # Loop invariants, built once instead of once per resource.
        year_range = re.compile(r'(\d{4})-(\d{4})')
        type_map = {int: "integer", float: 'real', str: 'varchar'}

        # For each file listed in the CKAN package ...
        for r in package['resources']:
            self.log("Processing: {}".format(r['name']))

            # A resource without a year range cannot be mapped to a table;
            # skip it instead of crashing on a None match.
            match = year_range.search(r['name'])
            if match is None:
                self.error("Skipping resource {}: no year range in name".format(r['name']))
                continue

            year = int(match.group(1))
            # Bound before the try block so the error handler below can
            # always reference it.
            table_name = 'private_schools_' + str(year)

            path = self.filesystem.download(r['url'])

            self.log("    File: {}".format(path))

            # Read all of the rows and figure out the header, length and types
            with open(path) as f:
                reader = csv.reader(f)
                # The first row is the header; it is kept because its values
                # become the column descriptions. The builtin next() works on
                # both Python 2 and 3, and the default avoids StopIteration
                # on an empty file.
                header = next(reader, None)
                types = []
                lengths = []
                for row in reader:
                    types, lengths = self.intuit_schema(row, types, lengths)

            if header is None:
                self.error("Skipping resource {}: file {} is empty".format(r['name'], path))
                continue

            # Now create schema entries
            try:
                with self.session:
                    table = self.schema.add_table(table_name)
                    table.add_column('id', datatype='integer', is_primary_key=True)

                    for i, description in enumerate(header):

                        field = self.transform_field_name(i, description)

                        try:
                            table.add_column(field, datatype=type_map[types[i]],
                                             width=int(lengths[i]), description=description)
                        except Exception:
                            # Log which column failed, then let the outer
                            # handler abort this table.
                            self.error("Failed to add column {}, {}.{}".format(i, table_name, field))
                            self.error("Header: {}".format(header))
                            raise
            except Exception as e:
                self.error("Aborting load for table {}: {}".format(table_name, e))
Esempio n. 3
0
    def meta_get_urls(self):
        """Write the CSV resource URLs from the CKAN package to meta/urls.yaml.

        The URLs are fetched from the repository once and stored locally so
        that whoever builds the package does not need an account on the repo.

        Each entry written to the YAML file is a mapping with the resource
        ``name``, the starting ``year`` parsed from a ``YYYY-YYYY`` range in
        that name, and the resource ``url``. Resources whose name contains no
        such range are reported through ``self.error`` and left out.
        """
        from ambry.client.ckan import new_ckan
        import re
        import yaml

        ckan = new_ckan(self.metadata.config.datarepo("default"))
        package = ckan.get_package(self.metadata.build.private_schools.source_package)

        # Compiled once; the same pattern is applied to every resource name.
        year_range = re.compile(r"(\d{4})-(\d{4})")

        urls = []

        for r in package["resources"]:

            # Without a year range there is nothing to record for this
            # resource; skip it instead of crashing on a None match.
            match = year_range.search(r["name"])
            if match is None:
                self.error("Skipping resource {}: no year range in name".format(r["name"]))
                continue

            year = int(match.group(1))

            urls.append(dict(name=str(r["name"]), year=year, url=str(r["url"])))

        with open(self.filesystem.path("meta", "urls.yaml"), "w") as f:
            f.write(yaml.dump(urls, indent=4, default_flow_style=False))
Esempio n. 4
0
    def meta_get_urls(self):
        """Write the CSV resource URLs from the CKAN package to meta/urls.yaml.

        The URLs are fetched from the repository once and stored locally so
        that whoever builds the package does not need an account on the repo.

        Each entry written to the YAML file is a mapping with the resource
        ``name``, the starting ``year`` parsed from a ``YYYY-YYYY`` range in
        that name, and the resource ``url``. Resources whose name contains no
        such range are reported through ``self.error`` and left out.
        """
        from ambry.client.ckan import new_ckan
        import re
        import yaml
        ckan = new_ckan(self.metadata.config.datarepo('default'))
        package = ckan.get_package(self.metadata.build.private_schools.source_package)

        # Compiled once; the same pattern is applied to every resource name.
        year_range = re.compile(r'(\d{4})-(\d{4})')

        urls = []

        for r in package['resources']:

            # Without a year range there is nothing to record for this
            # resource; skip it instead of crashing on a None match.
            match = year_range.search(r['name'])
            if match is None:
                self.error("Skipping resource {}: no year range in name".format(r['name']))
                continue

            year = int(match.group(1))

            urls.append(dict(
                name=str(r['name']),
                year=year,
                url=str(r['url']),
            ))

        with open(self.filesystem.path('meta', 'urls.yaml'), 'w') as f:
            f.write(yaml.dump(urls, indent=4, default_flow_style=False))