コード例 #1
0
def upload_to_socrata(**kwargs):
    from sodapy import Socrata

    # make a sodapy Socrata client
    s = Socrata("data.detroitmi.gov", Variable.get("SOCRATA_TOKEN"),
                Variable.get("SOCRATA_USER"), Variable.get("SOCRATA_PASS"))

    # get the SQLAlchemy engine
    hook = PostgresHook('etl_postgres')
    eng = hook.get_sqlalchemy_engine()

    # get the payload
    result = eng.execute(f"select * from {kwargs['table']}")
    payload = [dict(row) for row in result]

    if kwargs['method'] == 'replace':
        job = s.replace(kwargs['id'], payload)
    else:
        chunk_size = 10000
        for i in range(0, len(payload), chunk_size):
            try:
                r = s.upsert(kwargs['id'], payload[i:i + chunk_size])
            except:
                print(f"Error on record {i}")
                r = s.upsert(kwargs['id'], payload[i:i + chunk_size])
コード例 #2
0
 def send_crab_data(self):
     CONVERSION_RATE = .00000701549
     oocytes = Oocyte.objects.filter(crab=self).filter(chosen_count=10)
     client = Socrata("noaa-fisheries-afsc.data.socrata.com", "q3DhSQxvyWbtq1kLPs5q7jwQp",  username="******", password = "******")
     data = {'area_2': '',
              'area_5': '', 
              'calibration_5x': 0.00028, 
              'area_4': '', 
              'area_7': '', 
              'area_10': '', 
              'calibration_10x': 0.00056, 
              'area_9': '', 
              'year': '', 
              'sample': '', 
              'area_3': '', 
              'area_8': '', 
              'area_1': '', 
              'area_6': ''}
     data['area_1'] = oocytes[0].area * CONVERSION_RATE
     data['area_2'] = oocytes[1].area * CONVERSION_RATE
     data['area_3'] = oocytes[2].area * CONVERSION_RATE
     data['area_4'] = oocytes[3].area * CONVERSION_RATE
     data['area_5'] = oocytes[4].area * CONVERSION_RATE
     data['area_6'] = oocytes[5].area * CONVERSION_RATE
     data['area_7'] = oocytes[6].area * CONVERSION_RATE
     data['area_8'] = oocytes[7].area * CONVERSION_RATE
     data['area_9'] = oocytes[8].area * CONVERSION_RATE
     data['area_10'] = oocytes[9].area * CONVERSION_RATE
     data['year'] = datetime.datetime.now().year
     data['sample'] = self.sample_num
     payload = [data]
     client.upsert("km2u-hwjw", payload)
コード例 #3
0
ファイル: test_soda.py プロジェクト: Angiezhao/sodapy
def test_upsert_exception():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter)

    response_data = "403_response_json.txt"
    setup_mock(adapter, "POST", response_data, 403, reason="Forbidden")

    data = [{"theme": "Surfing", "artist": "Wavves",
             "title": "King of the Beach", "year": "2010"}]
    try:
        client.upsert(DATASET_IDENTIFIER, data)
    except Exception as e:
        assert isinstance(e, requests.exceptions.HTTPError)
    else:
        raise AssertionError("No exception raised for bad request.")
コード例 #4
0
    def upsert_to_socrata(client: Socrata, dataset_identifier: str,
                          zipper: dict) -> None:
        """
        Upsert data to Socrata dataset.

        :param client: Socrata connection client
        :param dataset_identifier: Unique Socrata dataset identifier. Not the data page identifier but primary page id.
        :param zipper: dictionary of zipped results (headers and data values)
        :return: None
        """
        try:
            client.upsert(dataset_identifier=dataset_identifier,
                          payload=zipper,
                          content_type='json')
        except Exception as e:
            print("Error upserting to Socrata: {}. {}".format(
                dataset_identifier, e))
        return
コード例 #5
0
def lambda_handler(event, context):
    '''
    Method called by Amazon Web Services when the lambda trigger fires. This lambda
    is configured to be triggered by file creation in the ITS DataHub
    Sandbox s3 bucket ("usdot-its-cvpilot-public-data" or "test-usdot-its-cvpilot-public-data").
    When a new file is added to the Sandbox s3 bucket, this lambda function will read
    the new JSON newline file, perform data transformation, upsert the new data
    records to the corresponding Socrata data set on data.transportation.gov, and remove the
    oldest records from the Socrata dataset to keep the data set at a manageable size.

    Parameters:
    	event, context: Amazon Web Services required parameters. Describes triggering event.
    '''
    # Read data from the newly deposited file and
    # perform data transformation on the records
    out_recs = []
    for bucket, key in lambda_to_socrata_util.get_fps_from_event(event):
        raw_recs = lambda_to_socrata_util.process_s3_file(bucket, key)
        for raw_rec in raw_recs:
            split_raw_recs = split_tim_tdf(raw_rec)
            processed_raw_recs = [process_tim(i) for i in split_raw_recs]
            out_recs += processed_raw_recs

    if len(out_recs) == 0:
        logger.info("No new data found. Exit script")
        return

    # Upsert the new records to the corresponding Socrata data set
    logger.info("Connecting to Socrata")
    client = Socrata("data.transportation.gov",
                     SOCRATA_API_KEY,
                     SOCRATA_USERNAME,
                     SOCRATA_PASSWORD,
                     timeout=400)

    logger.info("Transform record dtypes according to Socrata dataset")
    col_dtype_dict = lambda_to_socrata_util.get_col_dtype_dict(
        client, SOCRATA_DATASET_ID)
    float_fields = ['randomNum', 'metadata_generatedAt_timeOfDay']
    out_recs = [
        lambda_to_socrata_util.mod_dtype(r, col_dtype_dict, float_fields)
        for r in out_recs
    ]

    logger.info("Uploading {} new records".format(len(out_recs)))
    uploadResponse = client.upsert(SOCRATA_DATASET_ID, out_recs)
    logger.info(uploadResponse)
コード例 #6
0
ファイル: test_soda.py プロジェクト: Angiezhao/sodapy
def test_upsert():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, username=USERNAME, password=PASSWORD,
                     session_adapter=mock_adapter)

    response_data = "upsert_songs.txt"
    data = [{"theme": "Surfing", "artist": "Wavves",
             "title": "King of the Beach", "year": "2010"}]
    setup_mock(adapter, "POST", response_data, 200)
    response = client.upsert(DATASET_IDENTIFIER, data)

    assert isinstance(response, dict)
    assert response.get("Rows Created") == 1
    client.close()
コード例 #7
0
ファイル: test_soda.py プロジェクト: orio33/sodapy
def test_upsert_exception():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter)

    path = "/songs.json"
    response_data = "403_response_json.txt"
    set_up_mock(adapter, "POST", response_data, 403, reason="Forbidden")

    data = [{"theme": "Surfing", "artist": "Wavves",
             "title": "King of the Beach", "year": "2010"}]
    try:
        response = client.upsert(PATH, data)
    except Exception, e:
        assert isinstance(e, requests.exceptions.HTTPError)
コード例 #8
0
def log_request():
    print 'log_request', request.url
    print (socrata_app_token and socrata_username and socrata_password and socrata_access_log_domain and socrata_access_log_datasetid)
    if socrata_app_token and socrata_username and socrata_password and socrata_access_log_domain and socrata_access_log_datasetid:
        client = Socrata(socrata_access_log_domain, socrata_app_token, username=socrata_username, password=socrata_password)
        
        # fix this, see http://esd.io/blog/flask-apps-heroku-real-ip-spoofing.html
        if not request.headers.getlist("X-Forwarded-For"):
            ip = request.remote_addr
        else:
            ip = request.headers.getlist("X-Forwarded-For")[0]
        # for some reason a space and a * is causing an upsert error so am replacing space with %20
        url = str(request.url).replace(" ", "%20")
        # See Socrata's time format https://support.socrata.com/hc/en-us/articles/202949918-Importing-Data-Types-and-You-
        dtnow = datetime.utcnow().isoformat()
        dtnow = dtnow[:dtnow.index('.')]+'Z' 
        data = [{'datetime': dtnow, 'ip_address': str(ip), 'url': url}]
        print data
        print 'upsert', client.upsert(socrata_access_log_datasetid, data)
コード例 #9
0
ファイル: test_soda.py プロジェクト: sunsure/sodapy
def test_upsert_exception():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter)

    path = "/songs.json"
    response_data = "403_response_json.txt"
    set_up_mock(adapter, "POST", response_data, 403, reason="Forbidden")

    data = [{
        "theme": "Surfing",
        "artist": "Wavves",
        "title": "King of the Beach",
        "year": "2010"
    }]
    try:
        response = client.upsert(PATH, data)
    except Exception, e:
        assert isinstance(e, requests.exceptions.HTTPError)
コード例 #10
0
texas_df['UID'] = texas_df.apply(make_uid, axis=1)

texas_df['state'] = "Texas"

##########
# Combine the Data Frames
##########

# Concatenate the dataframes into one unified DF
df_combined = pd.concat([tracts_df, top_30_df, counties_df, texas_df],
                        sort=True)

# Replace NaN values that Socrata doesn't like
df_combined['place'] = df_combined['place'].fillna("N/A")
df_combined['state'] = df_combined['state'].fillna("N/A")
df_combined['county'] = df_combined['county'].fillna("N/A")
df_combined['tract'] = df_combined['tract'].fillna(0)

json_data = df_combined.to_dict(orient="records")

##########
# Send Data to Socrata
##########

try:
    socrata_upsert = socrata_client.upsert('evvr-qwa7', json_data)
    print(socrata_upsert)
except Exception as e:
    print("Exception, could not insert: " + str(e))
    print("Query: '%s'" % json_data)
コード例 #11
0
class MDSSocrata:

    __slots__ = [
        "mds_config",
        "mds_http_graphql",
        "mds_socrata_dataset",
        "provider_name",
        "query",
        "client",
    ]

    def __del__(self):
        """
        Make sure the client is closed whenever the class is destructed.
        :return:
        """
        try:
            self.client.close()
        finally:
            self.client = None

    def __str__(self) -> str:
        """
        Returns the configuration as string.
        :return:
        """
        return str({
            "SOCRATA_DATA_ENDPOINT":
            self.mds_config.get_setting("SOCRATA_DATA_ENDPOINT", None),
            "SOCRATA_DATASET":
            self.mds_config.get_setting("SOCRATA_DATASET", None),
            "SOCRATA_APP_TOKEN":
            self.mds_config.get_setting("SOCRATA_APP_TOKEN", None),
            "SOCRATA_KEY_ID":
            self.mds_config.get_setting("SOCRATA_KEY_ID", None),
            "SOCRATA_KEY_SECRET":
            self.mds_config.get_setting("SOCRATA_KEY_SECRET", None),
        })

    def __init__(self, provider_name, mds_config, mds_gql):
        """
        Constructor for the init class.
        :param str provider_name: The name of the provider
        :param MDSConfig mds_config: The configuration class where we can gather our endpoint
        :param MDSGraphQLRequest mds_gql: The http graphql class we need to make requests
        :return:
        """
        self.provider_name = provider_name
        self.mds_config = mds_config
        self.mds_http_graphql = mds_gql
        self.mds_socrata_dataset = self.mds_config.get_setting(
            "SOCRATA_DATASET", None)
        self.client = Socrata(
            self.mds_config.get_setting("SOCRATA_DATA_ENDPOINT", None),
            self.mds_config.get_setting("SOCRATA_APP_TOKEN", None),
            username=self.mds_config.get_setting("SOCRATA_KEY_ID", None),
            password=self.mds_config.get_setting("SOCRATA_KEY_SECRET", None),
            timeout=20,
        )
        self.query = Template("""
        query getTrips {
          api_trips(
                where: {
                provider: { provider_name: { _eq: "$provider_name" }}
                end_time: { _gte: "$time_min" },
                _and: { end_time: { _lt: "$time_max" }}
              }
          ) {
            trip_id: id
            device_id: device { id }
            vehicle_type
            trip_duration
            trip_distance
            start_time
            end_time
            modified_date
            council_district_start
            council_district_end
            census_geoid_start
            census_geoid_end
          }
        }
        """)

    def get_query(self, time_min, time_max) -> str:
        """
        Returns a string with the new query based on limit and offset.
        :param str time_min: The minimum time the trip ended
        :param str time_max: The maximum time the trip ended
        :return str:
        """
        if isinstance(self.provider_name, str) is False:
            raise Exception("provider_name must be a string")
        if isinstance(time_min, str) is False:
            raise Exception("time_min must be a sql datetime string")
        if isinstance(time_max, str) is False:
            raise Exception("time_max must be a sql datetime string")
        return self.query.substitute(provider_name=self.provider_name,
                                     time_min=time_min,
                                     time_max=time_max)

    def get_data(self, time_min, time_max) -> dict:
        """
        Gathers data from the API endpoint
        :param str time_min:
        :param str time_max:
        :return dict:
        """
        query = self.get_query(time_min=time_min, time_max=time_max)
        return self.mds_http_graphql.request(query)

    def get_config(self) -> dict:
        """
        Returns the configuration dictionary for testing.
        :return dict:
        """
        return self.mds_config.get_config()

    def save(self, data) -> dict:
        """
        Upserts the data into Socrata
        :param dict data: The data to be saved unto socrata.
        :return dict:
        """
        data = list(map(self.clean_trip_device_id, data))
        data = list(map(self.parse_datetimes, data))
        data = list(map(self.check_geos_data, data))
        if self.client is not None:
            return self.client.upsert(self.mds_socrata_dataset, data)
        else:
            raise Exception(
                "The socrata client is not initialized correctly, check your API credentials."
            )

    def parse_datetimes(self, data) -> dict:
        """
        Parses the PostgreSQL datetime with timezone into an insertable
        socrata timestamp in CST time. It also adds necessary fields,
        such as year, month, hour and day of the week.
        :param data:
        :return:
        """
        fmt = "%Y-%m-%dT%H:%M:%S"
        end_time = self.datetime_to_cst(data["end_time"])
        data["start_time"] = self.datetime_to_cst(
            data["start_time"]).strftime(fmt)
        data["end_time"] = end_time.strftime(fmt)
        data["modified_date"] = self.datetime_to_cst(
            data["modified_date"]).strftime(fmt)
        data["year"] = end_time.year
        data["month"] = end_time.month
        data["hour"] = end_time.hour
        data["day_of_week"] = end_time.weekday()
        return data

    @staticmethod
    def check_geos_data(data) -> dict:
        """
        Parses the PostgreSQL datetime with timezone into an insertable
        socrata timestamp in CST time. It also adds necessary fields,
        such as year, month, hour and day of the week.
        :param data:
        :return:
        """
        geos_fields = [
            "council_district_start",
            "council_district_end",
            "census_geoid_start",
            "census_geoid_end",
        ]

        for field in geos_fields:
            if data[field] is None or data[field] == "None":
                data[field] = 0
            else:
                data[field] = data.get(field, 0)
        return data

    @staticmethod
    def datetime_to_cst(timestamp):
        return parser.parse(timestamp).astimezone(tz.gettz("CST"))

    @staticmethod
    def clean_trip_device_id(trip) -> dict:
        """
        Transforms the device_id field from a dictionary into a string value
        :param dict trip: The trip dictionary being transformed
        :return dict:
        """
        trip["device_id"] = trip["device_id"]["id"]
        return trip
コード例 #12
0
class SocrataDataset(object):
    def __init__(self,
                 dataset_id,
                 socrata_client=None,
                 socrata_params={},
                 float_fields=[]):
        self.dataset_id = dataset_id
        self.client = socrata_client
        if not socrata_client and socrata_params:
            self.client = Socrata(**socrata_params)
        self.socrata_params = socrata_params
        self.col_dtype_dict = self.get_col_dtype_dict()
        self.float_fields = float_fields

    def get_col_dtype_dict(self):
        '''
        Retrieve data dictionary of a Socrata data set in the form of a dictionary,
        with the key being the column name and the value being the column data type

    	Returns:
    		data dictionary of a Socrata data set in the form of a dictionary,
            with the key being the column name and the value being the column data type
        '''
        dataset_col_meta = self.client.get_metadata(self.dataset_id)['columns']
        col_dtype_dict = {
            col['name']: col['dataTypeName']
            for col in dataset_col_meta
        }
        return col_dtype_dict

    def mod_dtype(self, rec, col_dtype_dict=None, float_fields=None):
        '''
        Make sure the data type of each field in the data record matches the data type
        of the field in the Socrata data set.

    	Parameters:
    		rec: dictionary object of the data record
            col_dtype_dict: data dictionary of a Socrata data set in the form of a dictionary,
            with the key being the column name and the value being the column data type
            float_fields: list of fields that should be a float

    	Returns:
    		dictionary object of the data record, with number, string, and boolean fields
            modified to align with the data type of the corresponding Socrata data set
        '''
        col_dtype_dict = col_dtype_dict or self.col_dtype_dict
        float_fields = float_fields or self.float_fields

        identity = lambda x: x
        dtype_func = {'number': float, 'text': str, 'checkbox': bool}
        out = {}
        for k, v in rec.items():
            if k in float_fields and k in col_dtype_dict:
                out[k] = float(v)
            elif k in col_dtype_dict:
                if v is not None and v is not '':
                    out[k] = dtype_func.get(
                        col_dtype_dict.get(k, 'nonexistentKey'), identity)(v)
        out = {k: v for k, v in out.items() if k in col_dtype_dict}
        return out

    def create_new_draft(self):
        draftDataset = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, self.dataset_id),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']),
            params={'method': 'copySchema'})
        logger.info(draftDataset.json())
        draftId = draftDataset.json()['id']
        return draftId

    def publish_draft(self, draftId):
        time.sleep(5)
        publishResponse = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, draftId),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']))
        logger.info(publishResponse.json())
        return publishResponse

    def delete_draft(self, draftId):
        time.sleep(5)
        deleteResponse = self.client.delete(draftId)
        if deleteResponse.status_code == 200:
            logger.info('Empty draft {} has been discarded.'.format(draftId))
        return deleteResponse

    def clean_and_upsert(self, recs, dataset_id=None):
        dataset_id = dataset_id or self.dataset_id
        out_recs = [self.mod_dtype(r) for r in recs]
        uploadResponse = self.client.upsert(dataset_id, out_recs)
        return uploadResponse
コード例 #13
0
            offset=offset,
            date_limit=get_date_limit(),
            initial_date_limit=get_initial_date_limit())
        offset += limit
        data = run_hasura_query(query)

        if "data" not in data:
            print("query: ", query)
            print("data: ", data)
            raise RuntimeError("There was a problem gathering data.")

        # Format records
        records = config["formatter"](data, config["formatter_config"])

        # Upsert records to Socrata
        client.upsert(config["dataset_uid"], records)
        total_records += len(records)

        if len(records) == 0:
            print(f'{total_records} {config["table"]} records upserted.')
            print(f'Completed {config["table"]} table.')
        elif total_records != 0:
            print(f"{total_records} records upserted")

# Terminate Socrata connection
client.close()

# Stop timer and print duration
end = time.time()
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
コード例 #14
0
    'bounce_rate',
    'entrance_rate',
    'exit_rate'
]

with open(os.path.join(report_folder, 'all-pages.json'), encoding='utf8') as json_file:
    data = json.load(json_file)
    taken_at = datetime.strptime(data['taken_at'], "%Y-%m-%dT%H:%M:%S.%fZ")
    datestamp = (taken_at - timedelta(days=1)).strftime('%Y-%m-%d')

    for page in data['data']:
        page = clean_ga_keys(page)
        visits = int(page.pop('visits'))

        page['date'] = datestamp
        page['id'] = datestamp + page['domain'] + page['page']
        page['pageviews'] = visits

        if visits > 0:
            page['bounce_rate'] = 100 * float(page['bounces']) / visits
            page['entrance_rate'] = 100 * float(page['entrances']) / visits
            page['exit_rate'] = 100 * float(page['exits']) / visits

        for key in percentages:
            page[key] = round(float(page.get(key, 0)), 2)

    upsert_chunks = [ data['data'][x:x+soda_batch_size] for x in range(0, len(data['data']), soda_batch_size) ]

    for chunk in upsert_chunks:
        soda_client.upsert(os.environ["SOCRATA_RESOURCEID"], chunk)
コード例 #15
0
class SocrataClient(Config):
    def __init__(self, **kwargs):
        self.client = None
        self.service_data = kwargs.get('service_data', None)
        self.dataset_id = kwargs.get('dataset_id', None)
        super(SocrataClient, self).__init__()

    def _connect(self):
        self.client = Socrata(self.credentials['socrata_url'],
                              self.credentials['socrata_token'],
                              self.credentials['socrata_username'],
                              self.credentials['socrata_password'])
        # See:
        # https://stackoverflow.com/questions/47514331/readtimeout-error-for-api-data-with-sodapy-client
        self.client.timeout = SOCRATA_MASTER_TIMEOUT

    def upsert(self):
        """
        NOTE: Unique ID in Socrata is set via UI
        https://support.socrata.com/hc/en-us/articles/ \
            360008065493-Setting-a-Row-Identifier-in-the-Socrata-Data-Management-Experience
        """
        if self.dataset_id is None:
            logging.error('[SOCRATA_CLIENT] No Socrata dataset ID provided')
            return
        if self.client is None:
            self._connect()
        groomed_data = self.dict_transform()
        data = self.send_data(groomed_data=groomed_data)

        logging.info('[SOCRATA_CLIENT] Upserting data')
        logging.info(self.client.upsert(self.dataset_id, data))

    def send_data(self, groomed_data):
        data = []
        for row in groomed_data:
            for key, entry in row.items():
                # Deformat and replace dates
                if isinstance(entry, datetime.datetime):
                    replacement = self.__deformat_date(entry)
                    row[key] = replacement
            data.append(row)
        return data

    def dict_transform(self):
        final_report = []
        for key, entry in self.service_data.items():
            # Add ID, go into dict
            row = {'id': key}
            for subkey, subent in entry.items():
                row[subkey] = subent
            final_report.append(row)
        return final_report

    def json_to_csv(self, filename='soctemp.csv'):
        """
        Convert JSON to temporary CSV file
        for initial upload
        """
        tempfile = os.path.join(ROOT_DIR, 'tmp', filename)
        fieldnames = set()
        final_report = []
        for key, entry in self.service_data.items():
            fieldnames.add('id')
            # Add ID, go into dict
            row = {'id': key}
            for subkey, subent in entry.items():
                row[subkey] = subent
                fieldnames.add(subkey)
            final_report.append(row)

        with open(tempfile, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(fieldnames))
            writer.writeheader()
            for row in final_report:
                writer.writerow(row)

    @staticmethod
    def __deformat_date(date):
        return date.isoformat()
コード例 #16
0
def lambda_handler(event, context):
    '''
    Method called by Amazon Web Services when the lambda trigger fires.
    This lambda is configured to be triggered at regular intervals.
    It will remove old records from Socrata data sets to keep it at
    a certain size.

    Parameters:
    	event, context: Amazon Web Services required parameters. Describes triggering event.
    '''
    logger.info("Connecting to Socrata")
    client = Socrata(SOCRATA_DOMAIN,
                     SOCRATA_API_KEY,
                     SOCRATA_USERNAME,
                     SOCRATA_PASSWORD,
                     timeout=400)

    # check number of records in each socrata data set
    # keep track of data sets that have more than x records
    id_toDelete_dict = {}
    for datasetId in SOCRATA_DATASET_IDS:
        r = requests.get("https://{}/resource/{}.json?$select=count(*)".format(
            SOCRATA_DOMAIN, datasetId),
                         auth=HTTPBasicAuth(SOCRATA_USERNAME,
                                            SOCRATA_PASSWORD))
        r = r.json()
        count = int(r[0]['count'])
        if count > DATASET_LIMIT:
            toDelete = count - DATASET_LIMIT
            id_toDelete_dict[datasetId] = toDelete
            logger.info(
                'data set {} has {} rows - removing oldest {} rows'.format(
                    datasetId, count, toDelete))
        else:
            logger.info('data set {} has {} rows - no action needed'.format(
                datasetId, count))
        time.sleep(5)

    # loop through data set to remove excess records
    # remove the oldest records to keep the data set at a manageable size
    # wait at end of each loop if hitting error
    wait = False
    while bool(id_toDelete_dict):
        for datasetId, toDelete in id_toDelete_dict.items():
            try:
                # remove at most 10000 records at a time to avoid timeouts
                N = min(toDelete, 10000)
                retrievedRows = client.get(datasetId,
                                           limit=N,
                                           exclude_system_fields=False,
                                           select=':id')
                deleteList = [{
                    ':id': row[':id'],
                    ':deleted': True
                } for row in retrievedRows]
                result = client.upsert(datasetId, deleteList)
                logger.info(id_toDelete_dict)
                toDelete = toDelete - N
                id_toDelete_dict[datasetId] = toDelete
                if not toDelete > 0:
                    del id_toDelete_dict[datasetId]
                if len(id_toDelete_dict) == 1:
                    time.sleep(1)
            except:
                logger.info(
                    'Error while deleting {} records in {}. Skipping for now'.
                    format(N, datasetId))
                wait = True
        if wait:
            logger.info('Slow down a bit. Wait 2 seconds.')
            time.sleep(2)
            wait = False
コード例 #17
0
def run(**kwargs):
    """
    Runs the program based on the above flags, the values will be passed to kwargs as a dictionary
    :param dict kwargs: The values specified by click decorators.
    :return:
    """
    # Start timer
    start = time.time()

    SOCRATA_DATASET = mds_config.get_setting("SOCRATA_DATASET", None)
    SOCRATA_ENDPOINT = mds_config.get_setting("SOCRATA_DATA_ENDPOINT", None)
    SOCRATA_APP_TOKEN = mds_config.get_setting("SOCRATA_APP_TOKEN", None)
    SOCRATA_KEY_ID = mds_config.get_setting("SOCRATA_KEY_ID", None)
    SOCRATA_KEY_SECRET = mds_config.get_setting("SOCRATA_KEY_SECRET", None)

    HASURA_ENDPOINT = mds_config.get_setting("HASURA_ENDPOINT", None)
    # Prep Hasura query
    HASURA_HEADERS = {
        "Content-Type": "application/json",
        "X-Hasura-Admin-Secret":
        mds_config.get_setting("HASURA_ADMIN_KEY", None)
    }

    time_min = kwargs.get("time_min", None)
    time_max = kwargs.get("time_max", None)

    query = """
        query getTrips($timeMin: timestamptz!, $timeMax: timestamptz!) {
          api_trips(
                where: {
                  end_time: { _gte: $timeMin }
                  _and: { 
                    end_time: { _lt: $timeMax }
                  },
                }
          ) {
            trip_id: id
            device_id: device { id }
            vehicle_type
            trip_duration
            trip_distance
            start_time
            end_time
            modified_date
            council_district_start
            council_district_end
            census_geoid_start
            census_geoid_end
          }
        }
    """

    response = requests.post(HASURA_ENDPOINT,
                             data=json.dumps({
                                 "query": query,
                                 "variables": {
                                     "timeMin": time_min,
                                     "timeMax": time_max,
                                 }
                             }),
                             headers=HASURA_HEADERS)

    def datetime_to_cst(timestamp) -> str:
        """
        Returns a datetime in CST timezone
        :param timestamp:
        :return:
        """
        try:
            return parser.parse(timestamp).astimezone(tz.gettz("CST"))
        except:
            return timestamp

    def parse_datetimes(data) -> dict:
        """
        Parses the PostgreSQL datetime with timezone into an insertable
        socrata timestamp in CST time. It also adds necessary fields,
        such as year, month, hour and day of the week.
        :param data:
        :return:
        """
        fmt = "%Y-%m-%dT%H:%M:%S"
        end_time = datetime_to_cst(data["end_time"])
        data["start_time"] = datetime_to_cst(data["start_time"]).strftime(fmt)
        data["end_time"] = end_time.strftime(fmt)
        data["modified_date"] = datetime_to_cst(
            data["modified_date"]).strftime(fmt)
        data["year"] = end_time.year
        data["month"] = end_time.month
        data["hour"] = end_time.hour
        data["day_of_week"] = end_time.weekday()
        return data

    def clean_trip(trip) -> dict:
        """
        Transforms the device_id field from a dictionary into a string value
        :param dict trip: The trip dictionary being transformed
        :return dict:
        """
        trip["device_id"] = trip["device_id"]["id"]
        trip = parse_datetimes(trip)
        return trip

    #
    # Generate the
    #
    try:
        trips = list(map(clean_trip, response.json()["data"]["api_trips"]))
    except:
        trips = []

    if len(trips) == 0:
        print("Nothing to do here, exiting.")
        exit(0)
    else:
        print(f"Total trips to process: {len(trips)}")

    print("Connecting to Socrata")

    # Setup connection to Socrata
    client = Socrata(
        SOCRATA_ENDPOINT,
        SOCRATA_APP_TOKEN,
        username=SOCRATA_KEY_ID,
        password=SOCRATA_KEY_SECRET,
        timeout=7200,
    )

    print(
        "Making upsert... this can take a while but not more than 2 hours. Please wait."
    )
    client.upsert(SOCRATA_DATASET, trips)

    # Stop timer and print duration
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Finished in: {:0>2}:{:0>2}:{:05.2f}".format(int(hours),
                                                       int(minutes), seconds))
コード例 #18
0
import requests
from sodapy import Socrata
import json
# store api token in creds.txt
with open('creds.json', 'r') as f:
    creds = json.loads(f.read())
client = Socrata("odn.data.socrata.com", creds["token"], username=creds['username'], password=creds['password'])
data = requests.get('https://odn.data.socrata.com/resource/jwbj-xtgt.json?$where=region_name%20IS%20NOT%20NULL%20AND%20location%20IS%20NULL').json()
for row in data:
    if len(row['region_name'].split(',')) > 2:
        location = requests.get('http://nominatim.openstreetmap.org/search/?q=%s&format=json' % (row['region_name'])).json()
    else:
        location = requests.get('http://nominatim.openstreetmap.org/search/?q=%s,usa&format=json' % (row['region_name'])).json()
    if location:
        location = location[0]
        row['location'] = "(%s, %s)" % (location['lat'], location['lon'])
client.upsert("k53q-ytmx", data)
コード例 #19
0
class SocrataDataset(object):
    """
    Helper class for interacting with datasets in Socrata.

    """
    logger = None

    def __init__(self,
                 dataset_id,
                 socrata_client=None,
                 socrata_params=None,
                 float_fields=None,
                 logger=None):
        """
        Initialization function of the SocrataDataset class.

        Parameters:
            dataset_id: 4x4 ID of the Socrata draft (e.g. x123-bc12)
            client: Optional parameter if the user chooses to pass in the
                socrata_params parameter. If user chooses not to pass in
                socrata_params, they can also pass in an sodapy.Socrata object
                that has been initialized with the proper socrata credentials.
            socrata_params: Optional parameter if the user choose to pass in the
                socrata_client parameter. Dictionary object containing Socrata
                credentials. Must include the following fields: 'username',
                'password', 'app_token', 'domain'.
            float_fields: An array of Socrata field names that should be of
                float types (numbers with decimals).
            logger: Optional parameter. Could pass in a logger object or not pass
                in anything. If a logger object is passed in, information will be
                logged instead of printed. If not, information will be printed.
        """
        self.socrata_params = {}
        self.float_fields = []
        self.dataset_id = dataset_id
        self.client = socrata_client
        if not socrata_client and socrata_params:
            self.client = Socrata(**socrata_params)
        self.socrata_params = socrata_params
        self.col_dtype_dict = self.get_col_dtype_dict()
        self.float_fields = float_fields
        self.print_func = print
        if logger:
            self.print_func = logger.info

    def get_col_dtype_dict(self):
        """
        Retrieve data dictionary of a Socrata data set in the form of a dictionary,
        with the key being the column name and the value being the column data type

    	Returns:
    		Data dictionary of a Socrata data set in the form of a dictionary,
            with the key being the column name and the value being the column data type.
        """
        dataset_col_meta = self.client.get_metadata(self.dataset_id)['columns']
        col_dtype_dict = {
            col['name']: col['dataTypeName']
            for col in dataset_col_meta
        }
        return col_dtype_dict

    def mod_dtype(self, rec, col_dtype_dict=None, float_fields=None):
        """
        Make sure the data type of each field in the data record matches the data type
        of the field in the Socrata data set.

    	Parameters:
    		rec: dictionary object of the data record
            col_dtype_dict: data dictionary of a Socrata data set in the form of a dictionary,
            with the key being the column name and the value being the column data type
            float_fields: list of fields that should be a float

    	Returns:
    		Dictionary object of the data record, with number, string, and boolean fields
            modified to align with the data type of the corresponding Socrata data set.
        """
        col_dtype_dict = col_dtype_dict or self.col_dtype_dict
        float_fields = float_fields or self.float_fields

        identity = lambda x: x
        dtype_func = {'number': float, 'text': str, 'checkbox': bool}
        out = {}
        for k, v in rec.items():
            if k in float_fields and k in col_dtype_dict:
                out[k] = float(v)
            elif (k in col_dtype_dict and v not in [None, '']):
                out[k] = dtype_func.get(
                    col_dtype_dict.get(k, 'nonexistentKey'), identity)(v)
        out = {k: v for k, v in out.items() if k in col_dtype_dict}
        return out

    def create_new_draft(self):
        """
        Create a new draft of the current dataset.

        Returns:
            Draft ID of the new draft.
        """
        draft_dataset = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, self.dataset_id),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']),
            params={'method': 'copySchema'})
        logger.info(draft_dataset.json())
        draft_id = draft_dataset.json()['id']
        return draft_id

    def publish_draft(self, draft_id):
        """
        Publish the Socrata draft specified.

        Parameters:
            draft_id: 4x4 ID of the Socrata draft (e.g. x123-bc12)

        Returns:
            Response of the publish draft request.
        """
        time.sleep(5)
        publish_response = requests.post(
            'https://{}/api/views/{}/publication.json'.format(
                self.client.domain, draft_id),
            auth=(self.socrata_params['username'],
                  self.socrata_params['password']))
        logger.info(publish_response.json())
        return publish_response

    def delete_draft(self, draft_id):
        """
        Delete the Socrata draft specified.

        Parameters:
            draft_id: 4x4 ID of the Socrata draft (e.g. x123-bc12)

        Returns:
            Response of the delete draft request.
        """
        time.sleep(5)
        delete_response = self.client.delete(draft_id)
        if delete_response.status_code == 200:
            logger.info('Empty draft {} has been discarded.'.format(draft_id))
        return delete_response

    def clean_and_upsert(self, recs, dataset_id=None):
        """
        Publish the Socrata draft specified.

        Parameters:
            recs: an array of dictionary objects of the data to upsert.
            dataset_id: 4x4 ID of the Socrata dataset (e.g. x123-bc12) to perform
            upserts to. This parameter is not required if you are performing upserts to the
            dataset you've initialized this class with.

        Returns:
            A dictionary object with the following fields:
            'Rows Deleted' - number of rows deleted due to the upsert request
            'Rows Updated' - number of rows updated due to the upsert request
            'Rows Created' - number of rows created due to the upsert request
        """
        dataset_id = dataset_id or self.dataset_id
        out_recs = [self.mod_dtype(r) for r in recs]
        upload_response = self.client.upsert(dataset_id, out_recs)
        return upload_response