Example #1
def create(name, filepath):
    socrata = Socrata(auth)
    with open(filepath, 'rb') as csv_file:
        (revision, output) = socrata.create(name=name).csv(csv_file)

        job = revision.apply(output_schema=output)
    revision.open_in_browser()

def create_new_dataset(client: Socrata, dataframe: DataFrame, name: str,
                       description: str):
    """Create and publish a dataframe as a new Socrata dataset."""
    revision: Revision
    output_schema: OutputSchema
    revision, output_schema = client.create(
        name=name,
        description=description,
        attributionLink='https://api.census.gov').df(dataframe)
    output_schema = prepare_output_schema(output_schema)

    # Handle geometry column type
    if 'geometry' in dataframe.columns:
        geometry: Optional[Literal['points', 'polygons']]
        if len(dataframe.loc[dataframe['geometry'].fillna('').str.match(
                '^POINT')]):
            geometry = 'points'
        elif len(dataframe.loc[dataframe['geometry'].fillna('').str.match(
                '^MULTIPOLYGON')]):
            geometry = 'polygons'
        else:
            geometry = None
        output_schema = add_geometry_to_output_schema(output_schema, geometry)

    # Handle pre-1.x versions of Socrata-py
    if isinstance(output_schema, tuple):
        _, output_schema = output_schema

    output_schema.wait_for_finish()
    revision.apply(output_schema=output_schema)
    return revision
Example #3
def wrapper(slf):
    test_name = str(method.__qualname__)
    pub = Socrata(auth)
    with open('test/fixtures/%s' % filename, 'rb') as file:
        create = pub.create(name="test for %s" % test_name,
                            description="a description")
        (revision, output) = getattr(create, kind)(file)
        try:
            method(slf, output)
        finally:
            (ok, view) = pub.views.lookup(revision.view_id())
            view.delete()

def create(name, filepath):
    socrata = Socrata(auth)
    with open(filepath, 'rb') as csv_file:
        (initial_rev, output) = socrata.create(name=name).csv(csv_file)

        job = initial_rev.apply(output_schema=output)
        job = job.wait_for_finish()

        view = socrata.views.lookup(initial_rev.attributes['fourfour'])
        update(view)

from socrata.authorization import Authorization
from socrata import Socrata
import os
import uuid

auth = Authorization("dsmp-nbe.test-socrata.com",
                     os.environ['SOCRATA_USERNAME'],
                     os.environ['SOCRATA_PASSWORD'])

socrata = Socrata(auth)
# Just so you can run this script a lot and not have config name collisions
config_name = 'parking-config-%s' % str(uuid.uuid4())

# Create the dataset initially
with open('../files/parking.csv', 'rb') as file:
    (revision, output) = socrata.create(name="parking").csv(file)

    print("Created", revision.ui_url())

    # Create a config for updating in the future
    (ok, config) = output.build_config(config_name, 'replace')
    assert ok, config

    print("Created configuration", config_name)

    (ok, job) = revision.apply(output_schema=output)
    assert ok, job
    # Let's wait for the upsert to finish before opening a new revision
    job.wait_for_finish()

# Update step
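# The listing cuts the script off here. Below is an assumed sketch of the update
# step, based on socrata-py's documented using_config flow; exact return shapes
# may differ between pre-1.x and 1.x releases of the library.
(ok, view) = socrata.views.lookup(revision.attributes['fourfour'])
assert ok, view

with open('../files/parking.csv', 'rb') as file:
    # Reuse the ingest configuration saved above to replace the dataset's rows
    (revision, job) = socrata.using_config(config_name, view).csv(file)
    job.wait_for_finish()
    print("Replaced data using configuration", config_name)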
Example #6
class Loader:
    """A class that loads Socrata datasets for a particular env."""

    env: Literal["test", "impl", "prod"]
    files_table: Dict[str, str]
    schemas_table: Dict[str, List[str]]
    tracker_table: Dict[str, str]
    access_token: Optional[str] = None
    access_token_expires: Optional[DateTime] = None
    client: Socrata

    def __init__(self, env: Literal["test", "impl", "prod"]) -> None:
        """Initialize this class instance with default values."""
        logger.info(f"Initializing Loader for env {env}")
        # Set env
        self.env = env

        # Load lookup tables
        self.load_files_table()
        self.load_schemas_table()
        self.load_tracker_table()

        # Initialize Socrata client
        auth: Authorization = Authorization(SOCRATA_DOMAIN, *SOCRATA_CREDENTIALS)
        self.client = Socrata(auth)

    @staticmethod
    def _request_details(r) -> str:
        """Given a response, return a string showing the status code, headers, and body."""
        request_headers = "\n" + "  \n".join(f"{k}: {v}"
                                             for k, v in r.request.headers.items())
        response_headers = "\n" + "  \n".join(f"{k}: {v}"
                                              for k, v in r.headers.items())
        return f"""requested {r.request.url}
request headers:{request_headers}
request cookies: {r.request._cookies}
request body: {r.request.body}

response code: {r.status_code}
response headers:{response_headers}
body: {r.text}
"""

    def load_files_table(self) -> None:
        """Load the files table as a dict on this instance."""
        table: Dict[str, str] = {}
        with open(FILES_TABLE_PATH) as files_table_file:
            reader: Iterable[List[str]] = csv.reader(files_table_file)
            next(reader)  # Skip the header row
            for file_name, file_encoding in reader:
                table[file_name] = file_encoding
        self.files_table = table
        logger.info(f"Loaded files table: {FILES_TABLE_PATH}")

    def load_schemas_table(self) -> None:
        """Load the schemas table as a dict on this instance."""
        table_unsorted: Dict[str, List[Tuple[int, str]]] = {}
        with open(SCHEMAS_TABLE_PATH) as schemas_table_file:
            reader: Iterable[List[str]] = csv.reader(schemas_table_file)
            next(reader)  # Skip the header row
            for file_name, column_index, column_name in reader:
                table_unsorted.setdefault(file_name, [])
                table_unsorted[file_name].append((int(column_index), column_name))

        # Sort columns by column index
        table: Dict[str, List[str]] = {}
        for file_name, columns in table_unsorted.items():
            columns_sorted: Iterable[Tuple[int, str]] = sorted(columns, key=itemgetter(0))
            column_names: List[str] = [column_name for column_index, column_name in columns_sorted]
            table[file_name] = column_names
        self.schemas_table = table
        logger.info(f"Loaded schemas table: {SCHEMAS_TABLE_PATH}")

    def load_tracker_table(self) -> None:
        """Load the tracker table as a dict on this instance."""
        with open(TRACKER_TABLE_PATH) as tracker_table_file:
            table: Dict[str, str] = json.load(tracker_table_file)
        self.tracker_table = table
        logger.info(f"Loaded tracker table: {TRACKER_TABLE_PATH}")

    def update_tracker_table(self) -> None:
        """Update the tracker table and write it back to disk."""
        logger.info(f"Updating tracker table: {TRACKER_TABLE_PATH}")
        with open(TRACKER_TABLE_PATH, "w") as tracker_table_file:
            json.dump(self.tracker_table, tracker_table_file, indent=2, sort_keys=True)
        logger.info(f"Updated tracker table: {TRACKER_TABLE_PATH}")

    def fetch_access_token(self) -> None:
        """Fetch an access token to obtain Plan Finder data."""
        # Construct request
        url: str = TOKEN_URLS[self.env]
        username: str = USERNAMES[self.env]
        key_id, key_secret = API_KEYS[self.env]
        body = {
            "userName": username,
            "scopes": "mpfpe_pde_full",
            "keyId": key_id,
            "keySecret": key_secret,
        }
        params = {}
        if self.env in ACS_PARAMS:
            params["ACS"] = ACS_PARAMS[self.env]

        # Submit HTTP POST request to obtain token
        logger.info(f"Fetching {self.env} access token")
        response: Response = requests.post(url, json=body, params=params)
        if response.status_code != 200:
            logger.error(Loader._request_details(response))
            raise RuntimeError(f"Failed to fetch token: HTTP status {response.status_code}")

        # Extract token from response
        response_json: dict = response.json()
        access_token: str = response_json["accessToken"]
        expires: int = response_json["expires"]
        self.access_token = access_token
        self.access_token_expires = DateTime.now() + Duration(seconds=expires)
        logger.info(f"Fetched {self.env} access token; expires {self.access_token_expires}")

    def fetch_zip_file(self, plan_year: str, date: Date = Date.today()) -> Path:
        """Download a Plan Finder zip file for a given date."""
        # If we don't have a current access token, fetch one
        no_access_token = self.access_token is None
        if no_access_token or DateTime.now() > (self.access_token_expires - Duration(minutes=5)):
            self.fetch_access_token()

        # Construct request
        url = DATA_URL
        headers = {
            "X-API-CONSUMER-ID": API_KEYS[self.env][0],
            "Authorization": f"Bearer {self.access_token}",
        }
        params = {"fileName": f"{plan_year}_{date.to_date_string()}"}

        # Submit GET request to download file
        logger.info(f"Fetching {self.env} zip file for plan year {plan_year} and date {date}")
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            raise RuntimeError(
                "Failed to fetch zip file (this may be expected for dates with no data): HTTP "
                f"status {response.status_code}"
            )

        # Save zip file to disk and return its path
        zip_bytes: bytes = response.content
        zip_file_path = DATA_DIR_PATH / f"{self.env}_{date}.zip"
        with open(zip_file_path, "wb") as zip_file:
            zip_file.write(zip_bytes)
        logger.info(f"Fetched {self.env} zip file: {zip_file_path}")
        return zip_file_path

    def unzip_zip_file(self, zip_file_path: Path) -> Path:
        """Unzip a zip file on disk.

        Because Plan Finder zip files throw errors when trying to unzip
        with a newer zip utility such as Python's built-in zipfile
        module, we need to use subprocess.run to call the unzip
        executable as a workaround.
        """
        extract_dir_path: Path = zip_file_path.parent / zip_file_path.stem
        logger.info(f"Unzipping zip file: {zip_file_path}")
        result: subprocess.CompletedProcess = subprocess.run(
            ["unzip", str(zip_file_path), "-d", str(extract_dir_path)]
        )
        if result.returncode == 0:
            logger.info(f"Unzipped zip file {zip_file_path}: {extract_dir_path}")
            return extract_dir_path
        else:
            raise RuntimeError(f"Failed to unzip {zip_file_path}: exit code {result.returncode}")

    def load_dataframe(self, data_file_path: Path) -> pd.DataFrame:
        """Load a pandas DataFrame object for a Plan Finder data file.

        This is the most robust/simple way of handling the various
        encoding and schema issues associated with Plan Finder data.
        """
        # Look up the schema (column indices and names) matching this file
        try:
            file_encoding = self.files_table[data_file_path.name]
        except KeyError:
            raise KeyError(
                f"Failed to find encoding for {data_file_path.name} in {FILES_TABLE_PATH}"
            )

        # Look up column names from schemas table
        try:
            column_names: List[str] = self.schemas_table[data_file_path.name]
        except KeyError:
            raise KeyError(
                f"Failed to find schema for {data_file_path.name} in {SCHEMAS_TABLE_PATH}"
            )

        # Load file as a dataframe using the column names and encoding we identified
        dataframe: pd.DataFrame = pd.read_csv(
            data_file_path, names=column_names, encoding=file_encoding, delimiter="\t", dtype=str
        )
        return dataframe

    def prepare_output_schema(self, output_schema: OutputSchema) -> OutputSchema:
        """Set all columns to text within a Socrata-py OutputSchema."""
        columns: List[dict] = output_schema.attributes["output_columns"]
        for column in columns:
            # Extract both field name and transform expression version of field name
            column_name: str = column["field_name"]
            transform_expr: str = column["transform"]["transform_expr"]
            column_name_match: re.Match = re.search(r"`[^`]+`", transform_expr)
            transform_column_name: str = column_name_match.group(0)
            transform = f"to_text({transform_column_name})"
            output_schema = output_schema.change_column_transform(column_name).to(transform)
        changed_output_schema: OutputSchema = output_schema.run()
        return changed_output_schema

    def is_old_hanging_draft(self, revision: dict) -> bool:
        """Determine whether a draft is at least a day old and unclosed."""
        draft_created_at: DateTime = pendulum.parse(revision["resource"]["created_at"])
        is_old_draft: bool = DateTime.now() - draft_created_at > Duration(days=1)
        is_hanging_draft: bool = revision["resource"]["closed_at"] is None
        return is_old_draft and is_hanging_draft

    def delete_old_hanging_drafts(self, dataset_id: str) -> None:
        """Delete any old hanging drafts for a given dataset.

        Checks the dataset's revision history for unclosed drafts that
        are at least a day old, and deletes any that are found.
        """
        # Check for hanging drafts
        auth: Tuple[str, str] = (self.client.auth.username, self.client.auth.password)
        domain_url = f"https://{self.client.auth.domain}".rstrip("/")
        revisions_url = f"{domain_url}/api/publishing/v1/revision/{dataset_id}"
        logger.info(f"Checking for old hanging drafts for dataset {dataset_id}…")
        response: Response = requests.get(revisions_url, auth=auth)
        try:
            response_json: List[dict] = response.json()
            old_hanging_drafts: Iterable[dict] = filter(self.is_old_hanging_draft, response_json)
            discard_urls: Iterable[str] = map(
                lambda revision: revision["links"]["discard"], old_hanging_drafts
            )
        except Exception:
            logger.exception(
                f"Failed when trying to delete old hanging drafts for dataset {dataset_id}"
            )
            return

        # Delete hanging drafts in turn
        for discard_url in discard_urls:
            delete_url = f"{domain_url}{discard_url}"
            logger.info(f"Deleting old hanging draft: {delete_url}")
            delete_response: Response = requests.delete(delete_url, auth=auth)
            delete_status: int = delete_response.status_code
            logger.info(f"Obtained HTTP {delete_status} response: {delete_url}")

    def create_dataset(self, data_file_path: Path, date: Date) -> str:
        """Create a new dataset on Socrata from a data file."""
        # Load dataframe
        dataframe: pd.DataFrame = self.load_dataframe(data_file_path)
        dataset_name = f"{data_file_path.stem} [{self.env}]"
        description = f"Plan Finder dataset {dataset_name}, released on {date}."

        # Create new dataset on Socrata, set all columns as text, and publish
        logger.info(f"Creating new dataset on Socrata: {data_file_path}")
        revision: Revision
        output_schema: OutputSchema
        revision, output_schema = self.client.create(
            name=dataset_name, description=description, category="Plan Finder", tags=[self.env]
        ).df(dataframe)
        revision = revision.update({"action": {"permission": SOCRATA_DATASET_PERMISSION}})
        dataset_id: str = revision.attributes["fourfour"]
        output_schema.wait_for_finish()
        output_schema = self.prepare_output_schema(output_schema)
        output_schema.wait_for_finish()
        job: Job = revision.apply()
        job.wait_for_finish()
        logger.info(f"Created dataset: {dataset_id}")
        return dataset_id

    @retry(
        wait=wait_random_exponential(multiplier=1, max=30),
        stop=stop_after_attempt(2),
        reraise=True,
    )
    def update_dataset(self, dataset_id: str, data_file_path: Path, date: Date) -> str:
        """Update an existing dataset on Socrata from a data file.

        If a failure occurs when updating, this function will attempt a single
        retry; if the failure persists on retry, the exception will be caught
        and logged in Loader.update_all_datasets.
        """
        # Delete old hanging drafts for this dataset (e.g. from previous failures)
        self.delete_old_hanging_drafts(dataset_id)

        # Load dataframe
        dataframe: pd.DataFrame = self.load_dataframe(data_file_path)
        dataset_name = f"{data_file_path.stem} [{self.env}]"
        description = f"Plan Finder dataset {dataset_name}, released on {date}."

        # Create replace revision on Socrata and publish
        logger.info(f"Updating dataset {dataset_id} on Socrata: {data_file_path}")
        view: View = self.client.views.lookup(dataset_id)
        revision: Revision = view.revisions.create_replace_revision(
            metadata={"description": description}, permission=SOCRATA_DATASET_PERMISSION
        )
        upload: Source = revision.create_upload(data_file_path.name)
        source: Source = upload.df(dataframe)
        source.wait_for_finish()
        output_schema: OutputSchema = source.get_latest_input_schema().get_latest_output_schema()
        output_schema.wait_for_finish()
        revision.apply(output_schema=output_schema)
        logger.info(f"Updated dataset: {dataset_id}")
        return dataset_id

    def delete_dataset(self, dataset_id: str) -> bool:
        """Delete an existing dataset on Socrata."""
        logger.info(f"Deleting dataset {dataset_id} on Socrata")
        auth: Tuple[str, str] = (self.client.auth.username, self.client.auth.password)
        domain_url = f"https://{self.client.auth.domain}".rstrip("/")
        dataset_url = f"{domain_url}/api/views/{dataset_id}"
        response: Response = requests.delete(dataset_url, auth=auth)
        response.raise_for_status()
        logger.info(f"Deleted dataset: {dataset_id}")
        return True

    def create_all_datasets(
            self, plan_year: str, date: Date = Date.today(), only_untracked: bool = False
    ) -> None:
        """Create Socrata datasets for Plan Finder data for a given date."""
        logger.info(f"Creating all {self.env} datasets on Socrata for {date}")
        self.load_tracker_table()
        if only_untracked is not True and any(self.tracker_table[self.env].values()):
            raise RuntimeError("Please delete all datasets in tracker before creating new ones")

        with create_data_dir():
            # Fetch and unzip zip file for this env and date
            zip_file_path: Path = self.fetch_zip_file(plan_year, date)
            extract_dir_path: Path = self.unzip_zip_file(zip_file_path)

            # Iterate over all files in newly unzipped directory
            data_file_paths: Iterable[Path] = extract_dir_path.glob("*")
            for data_file_path in data_file_paths:
                # Skip files that are not in the files table
                if data_file_path.name not in self.files_table:
                    continue

                # When only_untracked is True, skip files for which a dataset is already tracked
                dataset_is_tracked = (
                    self.tracker_table[self.env].get(data_file_path.name) is not None
                )
                if only_untracked is True and dataset_is_tracked:
                    continue

                # Create new dataset, skipping this one if we get an error
                try:
                    dataset_id: str = self.create_dataset(data_file_path, date)
                except Exception:
                    logger.exception(f"Failed to create dataset: {data_file_path}")
                    continue
                else:
                    # Add new dataset identifier (4x4) to tracker
                    self.tracker_table[self.env][data_file_path.name] = dataset_id
            logger.info(f"Finished creating {self.env} datasets for {date}")
            self.update_tracker_table()

    def update_all_datasets(
            self, plan_year: str, date: Date = Date.today(), only_file: Optional[str] = None
    ) -> None:
        """Update existing Socrata Plan Finder datasets for a given date."""
        logger.info(f"Updating {self.env} datasets on Socrata for {date}")
        self.load_tracker_table()
        with create_data_dir():
            # Fetch and unzip zip file for this env and date
            zip_file_path: Path = self.fetch_zip_file(plan_year, date)
            extract_dir_path: Path = self.unzip_zip_file(zip_file_path)

            # Iterate over all existing datasets in tracker
            failed_updates: List[Tuple[str, str]] = []
            for file_name, dataset_id in self.tracker_table[self.env].items():
                data_file_path = extract_dir_path / file_name

                # If only_file is specified, skip files not matching the supplied filename
                if only_file is not None and data_file_path.name != only_file:
                    continue

                # Skip files not included in this date's release or not in files table
                if not data_file_path.exists() or data_file_path.name not in self.files_table:
                    continue

                # Update dataset, skipping this one if we get an error
                try:
                    dataset_id: str = self.update_dataset(dataset_id, data_file_path, date)
                except Exception:
                    logger.exception(f"Failed to update dataset {dataset_id}: {data_file_path}")
                    failed_updates.append((file_name, dataset_id))
                    continue
            logger.info(
                f"Finished updating {self.env} datasets for {date}; {len(failed_updates)} failures"
            )
            for file_name, dataset_id in failed_updates:
                logger.info(f"Failed to update {file_name} [{self.env}] ({dataset_id})")

    def delete_all_datasets(self) -> None:
        """Delete all existing Plan Finder datasets on Socrata."""
        logger.info(f"Deleting all {self.env} datasets on Socrata")
        self.load_tracker_table()

        # Iterate over all existing datasets in tracker
        for file_name, dataset_id in self.tracker_table[self.env].items():
            # Skip blank dataset IDs
            if not dataset_id:
                continue

            # Delete dataset, skipping this one if we get an error
            try:
                self.delete_dataset(dataset_id)
            except Exception:
                logger.exception(f"Failed to delete dataset {dataset_id}")
                continue
            else:
                self.tracker_table[self.env][file_name] = ""
        logger.info(f"Finished deleting {self.env} datasets")
        self.update_tracker_table()
Example #7
from examples.auth import authorization
from socrata import Socrata
from prettytable import PrettyTable

socrata = Socrata(authorization)

with open('files/Seattle-Neighborhoods.zip', 'rb') as file:
    # Let's make a socrata view, open a revision on it, and then
    # upload and validate our data
    (revision,
     output_schema) = socrata.create(name="Seattle neighborhoods",
                                     description="a shapefile").shapefile(file)

    (ok, output_schema) = output_schema.wait_for_finish()
    assert ok, output_schema

    (ok, job) = revision.apply(output_schema=output_schema)
    assert ok, job

    revision.open_in_browser()

from examples.auth import authorization
from socrata import Socrata

socrata = Socrata(authorization)

with open('files/Sold_Fleet_Equipment.csv', 'rb') as file:
    # Let's make a socrata view, open a revision on it, and then
    # upload and validate our data
    (revision, output_schema) = socrata.create(
        name="cool dataset", description="~~my first dataset~~").csv(file)

    (ok, output_schema) = output_schema.wait_for_finish()
    assert ok, output_schema

    (ok, job) = revision.apply(output_schema=output_schema)
    assert ok, job

    revision.open_in_browser()

from examples.auth import authorization
from socrata import Socrata
import sys

socrata = Socrata(authorization)

file_path = sys.argv[1]
"""
This shows reprojecting from British National Grid
to WGS84

We're using the proj4 def from here:
http://spatialreference.org/ref/epsg/27700/
"""

with open(file_path, 'rb') as file:
    (revision, output_schema) = socrata.create(name="parking structures",
                                               description="cool").csv(file)


    (ok, output_schema) = output_schema\
        .add_column(
            'point_wgs84',
            'Location',
            """
            reproject_to_wgs84(
                set_projection(
                    make_point(
                        to_number(`northing`),
                        to_number(`easting`)
                    ),
                    "+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +datum=OSGB36 +units=m +no_defs"
                )
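                )
            """
        )\
        .run()

    # The listing truncates the example above: the closing of the add_column
    # call and everything below is an assumed continuation, mirroring the other
    # examples in this file (validate the transform, then apply the revision).
    assert ok, output_schema

    (ok, output_schema) = output_schema.wait_for_finish()
    assert ok, output_schema

    (ok, job) = revision.apply(output_schema=output_schema)
    assert ok, job

    revision.open_in_browser()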
Example #10
from examples.auth import authorization
from socrata import Socrata
import gntp.notifier

socrata = Socrata(authorization)

with open('files/Permits.csv', 'rb') as file:
    # Let's make a socrata view, open a revision on it, and then
    # upload and validate our data
    (revision, output_schema) = socrata.create(
        name="using a config", description="~~my first dataset~~").csv(file)

    (ok, output_schema) = output_schema.wait_for_finish()
    assert ok, output_schema

    revision.open_in_browser()

    print("Click on the 'Review Data' button to view the output schema")

    print(
        "Then click on the Address column's dropdown and click 'Use as georeference', and add a geocoded column"
    )

    print("Maybe you also want to change types of columns")

    _ = input("Click 'Save' in the UI and then hit enter to continue\n>>> ")

    # Get the output schema that is currently set
    (ok, output_schema) = revision.get_output_schema()
    assert ok, output_schema
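    # The listing ends here. A typical continuation, mirroring the earlier
    # examples in this file, would wait for the (now georeferenced) output
    # schema to validate and then apply the revision; this part is assumed.
    (ok, output_schema) = output_schema.wait_for_finish()
    assert ok, output_schema

    (ok, job) = revision.apply(output_schema=output_schema)
    assert ok, job

    job.wait_for_finish()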