Code example #1
def fetch_311_data(zip,
                   max_query_results=None,
                   num_entries_to_search=10000,
                   t_out=10) -> List[Dict[str, Any]]:
    nyc_311_dataset_domain = "data.cityofnewyork.us"
    nyc_311_dataset_identifier = "fhrw-4uyv"
    try:
        nyc_311_dataset_token = get_311_socrata_key()
    except KeyError:
        nyc_311_dataset_token = (
            None  # works with None but lower number of requests can be made
        )

    client = Socrata(nyc_311_dataset_domain, nyc_311_dataset_token)

    client.timeout = t_out

    try:
        return client.get(
            nyc_311_dataset_identifier,
            select="created_date, incident_zip, incident_address, city, complaint_type, descriptor, status",
            # q=str(zip),  # uncomment to query directly on the server side (may lead to a timeout)
            order="created_date DESC",
            limit=num_entries_to_search,
        )
    except requests.exceptions.Timeout:
        raise TimeoutError
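Since the server-side q=str(zip) filter is left commented out, the zip and max_query_results arguments are presumably applied to the returned rows on the client side; a minimal sketch of that step (the ZIP code is illustrative and the filtering itself is an assumption, not shown in the original):

rows = fetch_311_data("10001", max_query_results=20)  # "10001" is an illustrative ZIP
# hypothetical client-side filter on the incident_zip field selected above
matches = [row for row in rows if row.get("incident_zip") == "10001"]
matches = matches[:20]  # cap corresponding to max_query_results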
Code example #2
    def run(self):
        # S3 authentication
        ses = boto3.session.Session(profile_name='luigi_dpa',
                                    region_name='us-west-2')
        s3_resource = ses.resource('s3')

        obj = s3_resource.Bucket(self.bucket)
        print(ses)
        # Client authentication:
        client = Socrata(settings.get('dburl'),
                         settings.get('apptoken'),
                         username=settings.get('user'),
                         password=settings.get('pass'))

        # the results are returned as JSON from the API and
        # converted to a Python list by sodapy
        client.timeout = 1000
        limit = 1000000000

        # query
        results = client.get(
            "erm2-nwe9",
            limit=limit,
            where=f"created_date between '{self.year}-{self.month}-{self.day}T00:00:00.000' and '{self.year}-{self.month}-{self.day}T23:59:59.999'"
        )
        with self.output().open('w') as json_file:
            json.dump(results, json_file)
Code example #3
def fetch_nycOpenData(url, timeout, row_limit):
    client = Socrata("data.cityofnewyork.us", "eXBsiqAwodiCMHDYEheExaF3v",
                     "*****@*****.**", "Monkeydluffy55!")

    client.timeout = timeout
    # Returned as JSON from API / converted to Python list of dictionaries by sodapy.
    results = client.get(url, limit=row_limit)

    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)

    return results_df
Code example #4
def setup_socrata_client(credentials, nadac_parameters):
    """
    Set up a client to access a database on Socrata.

    Args:
        credentials (dict): Socrata app token from .env file
        nadac_parameters (dict): Parameters for downloading NADAC dataset from .env file
            WEBSITE: url of dataset (less 'http://www.')

    Returns:
        Socrata client
    """
    client = Socrata(nadac_parameters['WEBSITE'], credentials['APP_TOKEN'])
    client.timeout = int(nadac_parameters['TIMEOUT'])
    return client
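A minimal usage sketch for setup_socrata_client, assuming the two dicts are shaped like the .env values described in the docstring (the literal values here are hypothetical):

credentials = {"APP_TOKEN": "your-app-token"}           # hypothetical token
nadac_parameters = {"WEBSITE": "data.medicaid.gov",     # assumed NADAC host
                    "TIMEOUT": "60"}
client = setup_socrata_client(credentials, nadac_parameters)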
Code example #5
def main():
    logger.info('Creating Spark session')
    spark = pyspark.sql.SparkSession.builder. \
        master('local[*]'). \
        config("spark.sql.warehouse.dir", HIVE_WAREHOUSE_DIR). \
        config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", True). \
        enableHiveSupport(). \
        getOrCreate()  # create spark session to dump the data into Hive
    spark.sparkContext.setLogLevel('ERROR')
    logger.info('Initializing Hive database')
    spark.sql(f'drop database if exists {HIVE_DATABASE} cascade')
    spark.sql(f'create database {HIVE_DATABASE}')

    logger.info('Accessing remote dataset')
    soda_client = Socrata(DOMAIN, APP_TOKEN)  # create client for accessing API
    soda_client.timeout = 50  # otherwise we will get a lot of timeout errors
    dataset_size = query_size(soda_client, DATASET)
    num_batches = dataset_size // BATCH_SIZE + 1
    logger.info('Remote dataset size: %d (%d batches)', dataset_size,
                num_batches)

    schema = make_schema()  # spark dataframe schema for our data

    logger.info('Starting process pool')
    with multiprocessing.Pool(NUM_WORKERS) as pool:
        fetch_batch_partial = functools.partial(fetch_batch,
                                                soda_client=soda_client,
                                                batch_size=BATCH_SIZE)
        batch_it = pool.imap_unordered(fetch_batch_partial, range(num_batches))
        logger.info('Fetching data')
        for idx, batch in tqdm(batch_it, total=num_batches):
            df = spark.createDataFrame(batch, schema=schema)
            df = enforce_types(df)
            df.createOrReplaceTempView('tmp_table')
            if spark.catalog._jcatalog.tableExists(HIVE_TABLE):
                spark.sql(f'insert into {HIVE_TABLE} select * from tmp_table')
            else:
                spark.sql(
                    f'create table {HIVE_TABLE} as select * from tmp_table')

    logger.info('Probing the database')
    df = spark.sql(f"select * from {HIVE_TABLE} limit 200")
    logger.info(f'Received {df.count()} rows from {HIVE_TABLE}')

    soda_client.close()
    spark.stop()
    logger.info('Fetching finished')
    logger.info(f'All data has been written to Hive table {HIVE_TABLE}')
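The fetch_batch helper passed to imap_unordered above is not shown; a minimal sketch of what it might look like, assuming it returns an (index, rows) pair so the tqdm loop can unpack it, and reusing the DATASET constant and offset/limit pagination implied by main():

def fetch_batch(batch_index, soda_client, batch_size):
    # Hypothetical helper: fetch one page of the remote dataset and return it
    # together with its index so the consumer can track which batch it is.
    rows = soda_client.get(DATASET,
                           limit=batch_size,
                           offset=batch_index * batch_size)
    return batch_index, rows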
Code example #6
def get_ozone_data(yr, st):
    """
  Get ozone data from the CDC API
  
  yr - year of interest - string
  st - fips code for the state of interest - string
  """

    #establish connection to the CDC's data via Socrata
    client = Socrata("data.cdc.gov", parsed_yaml['cdc_key'],
                     parsed_yaml['cdc_username'], parsed_yaml['cdc_password'])

    #set timeout to 60 seconds
    client.timeout = 60

    #get number of records in the dataset
    record_count = client.get("kmf5-t9yc",
                              where=f"year2 = '{yr}' AND statefips = '{st}'",
                              select="COUNT(*)")

    print("The record count is", record_count)
    print("Getting data from the Socrata API...")

    #get data from dataset
    start = 0  #start at page 0
    chunk_size = 50000  #fetch 50,000 rows at a time
    results = []  #empty list to store data
    while True:
        #add data to the list
        results.extend(
            client.get(
                "kmf5-t9yc",
                where=f"year2 = '{yr}' AND statefips = '{st}'",  # SQL query
                select=
                "year2, month, countyfips, o3_max_pred",  # interested columns
                offset=start,
                limit=chunk_size))
        #pagination
        start = start + chunk_size
        print("At record number", start)
        #stop adding to the list once all the data is fetched
        if (start > int(record_count[0]['COUNT'])):
            break

    #return list so that it can be stored in a dataframe
    return results
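As the final comment notes, the returned list is meant to be stored in a dataframe; a minimal usage sketch (the year and state FIPS values are illustrative only):

import pandas as pd

ozone_rows = get_ozone_data("2016", "36")  # illustrative year and state FIPS code
ozone_df = pd.DataFrame.from_records(ozone_rows)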
Code example #7
    def run(self):
        '''
        Query the data from the 311 API
        '''
        # Client authentication:
        client = Socrata(settings.get('dburl'),
                         settings.get('apptoken'),
                         username=settings.get('user'),
                         password=settings.get('pass'))

        # the results are returned as JSON from the API and
        # converted to a Python list by sodapy
        client.timeout = 50
        results = client.get("erm2-nwe9", limit=1)

        with self.output().open('w') as json_file:
            json.dump(results, json_file)
Code example #8
def get_trip_records(limit=100000):

    client = Socrata('data.cityofchicago.org',
                     'Tk6RhuGAFvF9P4ehsysybj3IW',
                     username="******",
                     password="******")

    client.timeout = 10000

    results = client.get(
        "m6dm-c72p",
        limit=limit,
        select=('trip_id, trip_start_timestamp, trip_end_timestamp, trip_seconds, '
                'trip_miles, pickup_community_area, dropoff_community_area, fare, '
                'tip, additional_charges, trip_total')
    )

    return pd.DataFrame.from_records(results)
Code example #9
    def run(self):
        # Client authentication:
        client = Socrata(settings.get('dburl'),
                         settings.get('apptoken'),
                         username=settings.get('user'),
                         password=settings.get('pass'))

        # the results are returned as JSON from the API and
        # converted to a Python list by sodapy
        client.timeout = 1000
        limit = 1000000000

        # create the raw/year/month/day folder hierarchy
        os.makedirs(f'{path_raw}/{self.year}/{self.month}/{self.day}', exist_ok=True)

        # query
        results = client.get(
            "erm2-nwe9", limit=limit, where=f"created_date between '{self.year}-{self.month}-{self.day}T00:00:00.000' and '{self.year}-{self.month}-{self.day}T23:59:59.999'")
        with self.output().open('w') as json_file:
            json.dump(results, json_file)
Code example #10
def queryApi311(year, month, day):
    # Used in Task1: query the API
    # Client authentication:
    client = Socrata(settings.get('dburl'),
                     settings.get('apptoken'),
                     username=settings.get('user'),
                     password=settings.get('pass'))

    # the results are returned as JSON from the API and
    # converted to a Python list by sodapy
    client.timeout = 1000
    limit = 1000000000

    # query
    results = client.get(
        "erm2-nwe9",
        limit=limit,
        where=f"created_date between '{year}-{month}-{day}T00:00:00.000' and '{year}-{month}-{day}T23:59:59.999'"
    )

    return results
Code example #11
def get_data(chunk_size=100000, begin_date='2020-01-01'):
    #define parameters for endpoint, dataset, and app token
    path = '../data/'
    data_url = 'data.cityofnewyork.us'
    dataset = 'erm2-nwe9'
    with open(path + 'client_secret.json') as f:
        credentials = json.load(f)
    app_token = credentials['app_token']

    #sets up the connection, need application token to override throttling limits
    #username and password only required for creating or modifying data
    client = Socrata(data_url, app_token)
    client.timeout = 6000

    #count number of records in desired dataset
    record_count = client.get(dataset,
                              select='count(*)',
                              where=f"created_date >= '{begin_date}'")
    total_count = record_count[0]['count']
    print(total_count)

    start = 0
    results = []
    #paginate through the dataset in chunks of chunk_size to get all records since begin_date
    while True:
        print(f'{start} rows retrieved')
        results.extend(
            client.get(
                dataset,
                select="unique_key, created_date, closed_date, agency, agency_name, complaint_type, descriptor, location_type, incident_zip, borough, address_type, city, status, latitude, longitude, location",
                where=f"created_date >= '{begin_date}'",
                limit=chunk_size,
                offset=start))
        start += chunk_size
        if start > int(total_count):
            break
    return results
Code example #12
    def run(self):
        '''
        Query the data from the 311 API
        '''
        # S3 authentication
        ses = boto3.session.Session(profile_name='luigi_dpa', region_name='us-west-2')
        s3_resource = ses.resource('s3')

        obj = s3_resource.Bucket(self.bucket)
        print(ses)
        # Client authentication:
        client = Socrata("data.cityofnewyork.us",
                        "N2WpW61JnP5RoT5mrYGUaSUg9",
                        username="******",
                        password="******")

        # the results are returned as JSON from the API and
        # converted to a Python list by sodapy
        client.timeout = 1000
        results = client.get("erm2-nwe9", limit=100)

        with self.output().open('w') as json_file:
            json.dump(results, json_file)
Code example #13
def fetch_res_data(zip,
                   max_query_results=20,
                   num_entries_to_search=10000,
                   t_out=10) -> List[Dict[str, Any]]:
    nyc_res_dataset_domain = "data.cityofnewyork.us"
    nyc_res_dataset_identifier = "43nn-pn8j"
    nyc_res_dataset_token = (
        None  # works with None, but fewer requests can be made
    )

    client = Socrata(nyc_res_dataset_domain, nyc_res_dataset_token)

    client.timeout = t_out

    try:
        return client.get(
            nyc_res_dataset_identifier,
            select="dba, boro, zipcode, violation_description",
            # q=str(zip),  # uncomment to query directly on the server side (may lead to a timeout)
            order="score DESC",
            limit=num_entries_to_search,
        )
    except requests.exceptions.Timeout:
        raise TimeoutError
Code example #14
import sys

from sodapy import Socrata  # assumed import; Socrata is used below

from Statewide_Payroll import Fraction_Statewide_Payroll
from DCP_Capital import get_DCP_capital
from Statewide_Fringe import get_statewide_fringe



helper_dir = "/Users/alexanderweinstein/Documents/Harris/Summer2020/Carceral_Budgeting/Exploratory/Agency_Classes/Agency_Helpers"
sys.path.insert(0, helper_dir)
from SOQL_Constructors import construct_expenditures_SOQL, construct_budget_SOQL, construct_payroll_SOQL, \
    construct_settlements_SOQL
from Find_Data import find_data
from CY_To_FY import convert_CY_to_FY

app_token = "2Qa1WiG8G4kj1vGVd2noK7zP0"
client = Socrata("cthru.data.socrata.com", app_token)
client.timeout = 40


class StateAgency(Agency):
    """Last updated July 10th to get revenue data into it's own dataframe
    Possible to do: return one summary dataframe instead of expenditures, budget, revenue by year
    Another to do: add client in initialize agencies code
    To do: fix how year range is set, it's getting passed from multiple places and creating conflicts
    Really, really need to fix this it's causing lots of bugs. Need to set year range from one place, when
    agency class is created, and have it all propogate
    Also: something strange is happending where once initialize agencies has been run and then I call it agian,
    the objects aren't re-initialized. Should figure out what is going on
    Actually, objects are getting intialized when I import initialize agencies, which isn't what I want."""

    def __init__(self, alias, official_name, year_range, category, correction_function=lambda x:x, settlement_agencies=None,
                 payroll_vendors=[], payroll_official_name=None, client=None,
Code example #15
    def get_client(self):
        client = Socrata(self.socrata_domain, self.socrata_token)
        client.timeout = self.timeout
        return client
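This method is clearly lifted from a class; a minimal sketch of a wrapper it could live in, assuming the three attributes it reads are plain constructor arguments (the class name and defaults here are hypothetical):

from sodapy import Socrata

class SocrataFetcher:
    # Hypothetical container for the get_client method shown above.
    def __init__(self, socrata_domain, socrata_token, timeout=30):
        self.socrata_domain = socrata_domain
        self.socrata_token = socrata_token
        self.timeout = timeout

    def get_client(self):
        client = Socrata(self.socrata_domain, self.socrata_token)
        client.timeout = self.timeout
        return client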
Code example #16
File: main.py  Project: cmthomison/divvy-data
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import datetime as dt
import sqlite3
from sqlite3 import Error
from sqlalchemy import create_engine

from sodapy import Socrata
from prep import wrangle as wr

# Load data.
# None indicates no credentials required for public datasets.
client = Socrata("data.cityofchicago.org", None)
client.timeout = 120

# Get bikeshare records with sodapy.
results = client.get("fg6s-gzvg", limit=20000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# Get all stations.
stations = results_df[['from_station_id', 'from_station_name', 'from_latitude',
                       'from_longitude']].reset_index(drop=True)
stations.drop_duplicates(inplace=True)

cols = stations.columns.tolist()
stations.columns = [x.split('_',1)[1] for x in cols]
Code example #17
def main(
    dataset_id,
    table_name,
    database,
    socrata_username,
    socrata_password,
    where_clause,
    existing_table_rows="drop",
):
    """
    Read in dataset from Socrata and write output to Platform
    Parameters
    --------
    dataset_id: str
        Socrata dataset identifier
    table_name: str, optional
        destination table in Platform (schema.table)
    database: str, optional
        destination database in Platform
    socrata_username: str, optional
        username for socrata account, required for private data sets
    socrata_password: str, optional
        password for socrata account, required for private data sets
    where_clause: str, optional
        SoQL for filtering dataset
    existing_table_rows: str, optional
        options to pass to dataframe_to_civis command

    Outputs
    ------
    Adds data as file output
    and, if table_name and database are specified, writes data to Platform
    """

    socrata_client = Socrata(
        "data.lacity.org", None, username=socrata_username, password=socrata_password
    )

    socrata_client.timeout = 50

    raw_metadata = socrata_client.get_metadata(dataset_id)

    dataset = _read_paginated(socrata_client, dataset_id, where=where_clause)

    civis_client = civis.APIClient()

    if dataset.empty:
        msg = f"No rows returned for dataset {dataset_id}."
        LOG.warning(msg)
        write_and_attach_jsonvalue(json_value=msg, name="Error", client=civis_client)
    else:
        data_file_name = (
            f"{dataset_id}_extract_{datetime.now().strftime('%Y-%m-%d')}.csv"
        )
        file_id = _store_and_attach_dataset(
            client=civis_client, df=dataset, filename=data_file_name
        )
        LOG.info(f"add the {file_id}")

        if table_name:
            # Optionally start table upload
            LOG.info(f"Storing data in table {table_name} on database {database}")
            print("writing table")
            run_id = os.environ["CIVIS_RUN_ID"]
            job_id = os.environ["CIVIS_JOB_ID"]
            dataset["civis_job_id"] = job_id
            dataset["civis_run_id"] = run_id
            table_upload = civis.io.dataframe_to_civis(
                dataset,
                database=database,
                table=table_name,
                existing_table_rows=existing_table_rows,
            ).result()
            LOG.info(f"using {table_upload}")

    # Parse raw_metadata to extract useful fields and attach both raw and
    # cleaned metadata as script outputs
    metadata_file_name = (
        f"{dataset_id}_metadata_{datetime.now().strftime('%Y-%m-%d')}.json"
    )

    metadata_paths = {
        "Proposed access level": "metadata.custom_fields.Proposed Access Level.Proposed Access Level",  # noqa: E501
        "Description": "description",
        "Data updated at": "rowsUpdatedAt",
        "Data provided by": "tableAuthor.screenName",
    }

    _, clean_metadata = _store_and_attach_metadata(
        client=civis_client,
        metadata=raw_metadata,
        metadata_paths=metadata_paths,
        filename=metadata_file_name,
    )

    if table_name:
        sql = f'COMMENT ON TABLE {table_name} IS \'{clean_metadata["Description"]}\''
        civis.io.query_civis(
            sql, database=database, polling_interval=2, client=civis_client
        ).result()
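The helper _read_paginated is referenced above but not shown; a minimal sketch of what it might do, assuming it pages through the dataset with offset/limit and returns a pandas DataFrame (the page size is an arbitrary illustrative choice):

import pandas as pd

def _read_paginated(client, dataset_id, where=None, page_size=10000):
    # Hypothetical pagination helper: keep pulling pages until one comes back short.
    rows, offset = [], 0
    query = {"where": where} if where else {}
    while True:
        page = client.get(dataset_id, limit=page_size, offset=offset, **query)
        rows.extend(page)
        if len(page) < page_size:
            break
        offset += page_size
    return pd.DataFrame.from_records(rows)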
Code example #18
def main(
    socrata_client_url: str,
    dataset_id: str,
    civis_table_name: str,
    civis_database: str,
    database_type: str,
    socrata_username: str,
    socrata_password: str,
    grant_group: str,
    varchar_len: str = None,
    action_existing_table_rows: str = "drop",
):
    """
    Read in dataset from Socrata and write output to Platform

    Parameters
    --------
    socrata_client_url: str
        url of socrata portal being referenced
    dataset_id: str
        Socrata dataset identifier
    civis_table_name: str
        destination table in Platform (schema.table)
    civis_database: str
        destination database in Platform
    database_type: str
        type of destination database
    socrata_username: str, optional
        username for socrata account, required for private data sets
    socrata_password: str, optional
        password for socrata account, required for private data sets
    grant_group: str
        string of group(s) that are passed to civis API to be granted select
        table access
    varchar_len: str
        sets the varchar length when datatypes are passed to the civis API; 256 is
        the default
    action_existing_table_rows: str, optional
        options to pass to dataframe_to_civis command

    Outputs
    ------
    Adds data as file output and, if table_name and database are specified,
    writes data to Platform
    """

    socrata_client = Socrata(socrata_client_url,
                             None,
                             username=socrata_username,
                             password=socrata_password)
    # define socrata client

    civis_client = civis.APIClient()
    # define civis client

    socrata_client.timeout = 50

    sample_data = socrata_client.get(dataset_id,
                                     limit=5,
                                     content_type="csv",
                                     exclude_system_fields=False,
                                     offset=0)
    # collects sample data from dataset

    sample_data_df = results_to_df(sample_data)
    # writes sample data to dataframe

    if sample_data_df.empty:
        msg = f"No rows returned for dataset {dataset_id}."
        LOG.warning(msg)
        write_and_attach_jsonvalue(json_value=msg,
                                   name="Error",
                                   client=civis_client)
        os._exit(1)
    # exits if no rows are available in the dataset

    raw_metadata = socrata_client.get_metadata(dataset_id)
    # calls for raw metadata

    sql_type = select_sql_map(database_type, varchar_len)
    # defines appropriate sql types for datatype mapping depending on
    # specifications

    (
        civis_table_columns,
        point_columns,
        pandas_column_order,
        extra_columns,
    ) = create_col_type_dict(raw_metadata, sample_data_df, sql_type)
    # creates a civis-specific array of dicts that maps column names to
    # datatypes using socrata metadata as guidance. Also provides point
    # columns that are used to clean point column formatting during import,
    # and an array of columns that corresponds to the order of the mapping
    # dict (civis_file_to_table is sensitive to order).

    print("Columns present in Metadata but not in data:", extra_columns)

    consolidated_csv_path = _read_paginated(
        client=socrata_client,
        dataset_id=dataset_id,
        point_columns=point_columns,
        column_order=pandas_column_order,
    )
    # reads in socrata data in chunks (using offset and page_limit), appends
    # it all to one csv, and outputs the path here

    data_file_name = f"{dataset_id}_extract_{datetime.now().strftime('%Y-%m-%d')}.csv"
    uploaded_file_id = _store_and_attach_dataset_csv(
        client=civis_client,
        csv_path=consolidated_csv_path,
        filename=data_file_name)
    print("file_id:", uploaded_file_id)
    LOG.info(f"add the {uploaded_file_id}")

    LOG.info(
        f"Storing data in table {civis_table_name} on database {civis_database}"
    )

    table_upload = civis.io.civis_file_to_table(
        file_id=uploaded_file_id,
        database=civis_database,
        table=civis_table_name,
        table_columns=civis_table_columns,
        existing_table_rows=action_existing_table_rows,
        headers=True,
    ).result()
    LOG.info(f"using {table_upload}")
    # takes in file id and writes to table

    metadata_file_name = (
        f"{dataset_id}_metadata_{datetime.now().strftime('%Y-%m-%d')}.json")
    # parse raw_metadata to extract useful fields and attach both raw and
    # cleaned metadata as script outputs

    upload_metadata_paths = {
        "Description": "description",
        "Data updated at": "rowsUpdatedAt",
        "Data provided by": "tableAuthor.screenName",
    }

    _, clean_metadata = _store_and_attach_metadata(
        client=civis_client,
        metadata=raw_metadata,
        metadata_paths=upload_metadata_paths,
        filename=metadata_file_name,
    )

    if civis_table_name:
        sql = f"""
                COMMENT ON TABLE {civis_table_name} IS
                \'{clean_metadata["Description"]}\'
                 """
        civis.io.query_civis(sql,
                             database=civis_database,
                             polling_interval=2,
                             client=civis_client).result()

    if grant_group:
        sql = f"GRANT ALL ON {civis_table_name} TO GROUP {grant_group}"
        civis.io.query_civis(sql,
                             database=civis_database,
                             polling_interval=2,
                             client=civis_client).result()
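results_to_df is referenced above but not defined here; a minimal sketch under the assumption that sodapy returns the CSV-typed response as a list of rows with the header row first:

import pandas as pd

def results_to_df(sample_data):
    # Hypothetical helper: the first row is assumed to be the CSV header.
    header, *rows = sample_data
    return pd.DataFrame(rows, columns=header)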
Code example #19
def controllerCenter(allIDS):

    flagCounter = 0
    limit = 10

    token = 'sC4N6wXghMXaL2C3uUxVMphf0'
    client = Socrata('www.datos.gov.co',
                     token,
                     username="******",
                     password="******")

    client.timeout = 180

    print("Conjuntos de datos a evaluar", len(allIDS))

    for i in allIDS:

        resultado = ""
        flagCounter += 1

        try:

            try:

                # Generate the inputs for the process.

                datasetURL = "https://www.datos.gov.co/resource/{}.json".format(i)

                print(flagCounter)
                print(datasetURL)

                # Validate the URL and check that it is accessible
                statusDataset = requests.get(datasetURL, timeout=60)

                # Get the dataset's records
                datosDataSet = client.get(i, limit=limit)

                # Get the dataset's metadata
                metaDataset = client.get_metadata(i)

                # Check that the dataset is not empty
                frametoValidate = pd.DataFrame.from_records(datosDataSet)

                # Validate the dataset
                metaData = metaDataset['metadata']


            except KeyError as error:

                print(error)
                logging.error(str(error))
                indexCompletitud = 0
                indexCredibilidad = 0
                indexActualidad = 0
                indexTrazabilidad = 0
                indexDisponibildiad = 0
                indexConformidad = 0
                indexComprensibilidad = 0
                indexPortabilidad = 0
                indexConsistencia = 0
                indexExactitud = 0

            except TimeoutError as error:

                logging.error(str(error))

            except requests.exceptions.ConnectionError as error:

                logging.error(str(error))


            else:

                # Compute the quality indicators for each evaluated dataset

                if statusDataset.status_code == 200 and not frametoValidate.empty:

                    # Start the result row with the dataset id
                    resultado = str(i)

                    # Generate the first availability indicator
                    resultado = resultado + ';' + str(10)

                    # Create the evaluation instance
                    evaluation = Evaluation()

                    # Completeness indicator
                    indexCompletitud = evaluation.indicadorCompletitud(frametoValidate)
                    resultado = resultado + ',' + str(indexCompletitud)

                    # Timeliness indicator
                    indexActualidad = evaluation.indicadorActualidad(metaDataset, metaData)
                    resultado = resultado + ',' + str(indexActualidad)

                    # Credibility indicator
                    indexCredibilidad = evaluation.indicadorCredibilidad(metaDataset)
                    resultado = resultado + ',' + str(indexCredibilidad)

                    # Traceability indicator
                    indexTrazabilidad = evaluation.indicadorTrazabilidad(metaDataset)
                    resultado = resultado + ',' + str(indexTrazabilidad)

                    # Conformity indicator
                    indexConformidad = evaluation.indicadorConformidad(metaDataset)
                    resultado = resultado + ',' + str(indexConformidad)

                    # Understandability indicator
                    indexComprensibilidad = evaluation.indicadorComprensibilidad(metaDataset, frametoValidate)
                    resultado = resultado + ',' + str(indexComprensibilidad)

                    # Portability indicator
                    indexPortabilidad = evaluation.indicadorPortabilidad(datosDataSet)
                    resultado = resultado + ',' + str(indexPortabilidad)

                    # Consistency indicator
                    indexConsistencia = evaluation.indicadorConsisetencia(frametoValidate)
                    resultado = resultado + ',' + str(indexConsistencia)
                    # Accuracy indicator
                    indexExactitud = evaluation.indicadorExactitud(metaDataset, frametoValidate)
                    resultado = resultado + ',' + str(indexExactitud)

                else:

                    print("error")

            finally:


                with open("Quality_Indicators.csv", 'a', encoding='UTF-8') as qIndicators:
                    qIndicators.write(str(resultado))
                    qIndicators.write('\n')

        except BrokenPipeError as errorBroken:

            logging.error(str(errorBroken))
            resultado = str(i) + ";0,0,0,0,0,0,0,0,0,0"
            with open("Quality_Indicators.csv", 'a', encoding='UTF-8') as qIndicators:
                qIndicators.write(str(resultado))
                qIndicators.write('\n')
Code example #20
    args = parser.parse_args()

    start_date = args.StartDate
    end_date = args.EndDate
    if args.SaveDir is None:
        save_dir = "/home/jtl/Dropbox (MIT)/DOE_TSMS/00_Data/raw_data/mobility/"
    else:
        save_dir = args.SaveDir

    apitoken = cityofchicago["apitoken"]
    username = cityofchicago["username"]
    pwd = cityofchicago["pwd"]

    client = Socrata("data.cityofchicago.org", apitoken, username, pwd)
    # automatic timeout is 10s, increase to 2 hours
    client.timeout = 7200

    for y, m in [(i.year, i.month)
                 for i in pd.period_range(start_date, end_date, freq="M")]:
        print("Downloading %d-%d" % (y, m))

        time = datetime.datetime.now()
        # slow
        # results = client.get("m6dm-c72p", where="date_extract_y(trip_start_timestamp) = " + str(y) +" AND date_extract_m(trip_start_timestamp)= " + str(m), content_type="csv", limit=2000)
        if m < 10:
            pad = '0'
        else:
            pad = ''
        (y1, m1) = calendar.nextmonth(year=y, month=m)
        if m1 < 10:
            pad1 = '0'
Code example #21
File: main.py  Project: mengyan0803/STA9760Project1
import os

from sodapy import Socrata
import requests
from requests import HTTPError
import json
from datetime import datetime
from elasticsearch import Elasticsearch
from time import sleep

DATA_URL = "data.cityofnewyork.us"
DATA_ID = 'nc67-uf89'

app_token = os.environ.get("APP_TOKEN")

client = Socrata(DATA_URL, app_token)

client.timeout = 60


def create_and_update_index(index_name):
    es = Elasticsearch()
    try:
        es.indices.create(index=index_name)
    except Exception:
        pass
    return es


def data_formatting(datastring):
    for key, value in datastring.items():
        if 'amount' in key:
            datastring[key] = float(value)