Code Example #1
File: generate.py Project: yammadev/col-covid-19
import pandas as pd
from sodapy import Socrata


def main():
    # [1] Get data
    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata('www.datos.gov.co', None)

    # [2] Results
    # Results returned as JSON from API / converted to Python list of
    # dictionaries by sodapy.
    records, samples = client.get_all('gt2j-8ykr'), client.get_all('8835-5baf')
    # records, samples = client.get('gt2j-8ykr', limit = 2000), client.get('8835-5baf', limit = 2000)

    # [3] Convert
    # Convert to pandas DataFrame
    records, samples = pd.DataFrame.from_records(
        records), pd.DataFrame.from_records(samples)

    # [*] Columns
    # print(records.columns)
    # ['id_de_caso', 'fecha_de_notificaci_n', 'c_digo_divipola',
    #   'ciudad_de_ubicaci_n', 'departamento', 'atenci_n', 'edad', 'sexo',
    #   'tipo', 'estado', 'pa_s_de_procedencia', 'fis', 'fecha_diagnostico',
    #   'fecha_recuperado', 'fecha_reporte_web', 'tipo_recuperaci_n',
    #   'codigo_departamento', 'codigo_pais', 'pertenencia_etnica',
    #   'fecha_de_muerte', 'nombre_grupo_etnico']

    # print(samples.columns)
    # ['fecha', 'acumuladas', 'amazonas', 'antioquia', 'arauca', 'atlantico',
    #   'bogota', 'bolivar', 'boyaca', 'caldas', 'caqueta', 'casanare', 'cauca',
    #   'cesar', 'choco', 'cordoba', 'cundinamarca', 'guainia', 'guajira',
    #   'guaviare', 'huila', 'magdalena', 'meta', 'narino',
    #   'norte_de_santander', 'putumayo', 'quindio', 'risaralda', 'san_andres',
    #   'santander', 'sucre', 'tolima', 'valle_del_cauca', 'vaupes', 'vichada',
    #   'procedencia_desconocida', 'positivas_acumuladas',
    #   'negativas_acumuladas', 'positividad_acumulada', 'indeterminadas',
    #   'barranquilla', 'cartagena', 'santa_marta']

    # [4] Get desired columns
    samples = samples[['fecha', 'acumuladas']]

    # [5] Reset columns
    records.columns = [
        'CASE', 'NOTIFICATION_DATE', 'COD_DIVIPOLA', 'CITY', 'DEPARTAMENT',
        'STATUS', 'AGE', 'GENDER', 'KIND', 'LEVEL', 'ORIGIN',
        'SYMPTOMS_BEGINNING_DATE', 'DIAGNOSIS_DATE', 'RECOVERED_DATE',
        'REPORT_DATE', 'KIND_OF_RECOVERY', 'DEPARTAMENT_CODE', 'COUNTRY_CODE',
        'ETHNICITY', 'DEATH_DATE', 'ETHNIC_GROUP_NAME'
    ]
    samples.columns = ['DATE', 'ACCUMULATED']

    # [6] Export!
    # (processed, statistics, timeline, and plot are helpers defined
    # elsewhere in this project; the bare list(records) call below
    # discards its result.)
    list(records)
    processed(samples)
    statistics(records)
    timeline(records)

    # [7] Plot!
    plot()
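A note on the pattern this example shares with most of the ones below: pd.DataFrame.from_records() drains the get_all generator entirely into memory. For datasets too large for that, here is a minimal chunked-consumption sketch (the iter_chunks helper and its page size are ours, not from the project):

from itertools import islice

def iter_chunks(records, size=50000):
    # Yield successive DataFrames of at most `size` records each,
    # consuming the generator lazily instead of all at once.
    it = iter(records)
    while chunk := list(islice(it, size)):
        yield pd.DataFrame.from_records(chunk)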
Code Example #2
def test_get_all():
    mock_adapter = {}
    mock_adapter["prefix"] = PREFIX
    adapter = requests_mock.Adapter()
    mock_adapter["adapter"] = adapter
    client = Socrata(DOMAIN, APPTOKEN, session_adapter=mock_adapter)

    setup_mock(adapter,
               "GET",
               "bike_counts_page_1.json",
               200,
               query="$offset=0")
    setup_mock(adapter,
               "GET",
               "bike_counts_page_2.json",
               200,
               query="$offset=1000")
    response = client.get_all(DATASET_IDENTIFIER)

    assert inspect.isgenerator(response)
    data = list(response)
    assert len(data) == 1001
    assert data[0]["date"] == "2016-09-21T15:45:00.000"
    assert data[-1]["date"] == "2016-10-02T01:45:00.000"

    client.close()
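The test above confirms that get_all returns a generator and pages through the dataset in $offset steps of 1000. A rough sketch of that paging behavior, built only on get()'s documented limit/offset parameters (a simplified illustration, not sodapy's actual internals):

def get_all_manual(client, dataset_identifier, page_size=1000, **kwargs):
    # Request successive pages until a short page signals the end.
    offset = 0
    while True:
        page = client.get(dataset_identifier, limit=page_size,
                          offset=offset, **kwargs)
        yield from page
        if len(page) < page_size:
            return
        offset += page_size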
Code Example #3
File: app.py Project: VicData/covid19-co
def lambda_handler(event, context):
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')  # created but never used below
    client = Socrata("www.datos.gov.co", None)
    results = client.get_all(os.environ['DATASET'])
    tmp_path = "/tmp/"
    file_prefix = os.environ['FILE_PREFIX']
    now = datetime.now(pytz.timezone("America/Bogota"))
    deit = now.strftime("%Y%m%d")  # date stamp in YYYYMMDD form
    bucket_path = os.environ['BUCKET_PATH']
    bucket = os.environ['BUCKET']
    local_file = '%s%s_%s.json' % (tmp_path, file_prefix, deit)
    s3_file = '%s%s_%s.json' % (bucket_path, file_prefix, deit)
    # Stream the generator to disk as JSON Lines (one record per line).
    with open(local_file, 'w') as f:
        for item in results:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")

    # Re-read the file to count the records written.
    with open(local_file, "r") as myfile:
        data = myfile.readlines()
    vec = len(data)
    s3_client.upload_file(local_file, bucket, s3_file)
    uploaded = True
    print("Upload Successful from %s to bucket %s in path %s" % (local_file, bucket, s3_file))
    return {
        "statusCode": 200,
        "body": json.dumps({"results":vec,"uploaded":uploaded}),
    }
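A small hypothetical refactor of the write loop above: since get_all yields records one at a time, they can be counted during the write, which removes the readlines() pass that re-reads the whole file (record_count is our name, not the project's):

    record_count = 0
    with open(local_file, 'w') as f:
        for item in results:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")
            record_count += 1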
Code Example #4
    def cdc_data(self):
        start_date_iso = datetime.combine(START_DATE, datetime.min.time()).isoformat()
        parameters = {
            '$where': "date >= '{}'".format(start_date_iso),
        }

        client = Socrata(EXTRACT_URL, SODA_APP_TOKEN)
        return client.get_all(DATASET_ID, **parameters)
Code Example #5
def get_raw_data():
    # Instruction from the CDC website to extract data
    # Unauthenticated client only works with public data sets. Note "None"
    # in place of application token, and no username or password:
    client = Socrata("chronicdata.cdc.gov", None)

    # Results come back as JSON from the API; get_all yields them as a
    # generator of dictionaries rather than a list.
    results = client.get_all("us8e-ubyj")

    # Convert to pandas DataFrame
    return pd.DataFrame.from_records(results)
Code Example #6
def extract_api(url, key, dataset_id):
    date = datetime.today().strftime('%Y-%m-%d')
    client = Socrata(url, key)
    print("extracting data from NYC Open Data . .\n")
    results = client.get_all(dataset_id, content_type="csv")
    # Write data to a local file. With content_type="csv", every page that
    # get_all fetches begins with its own header row ('camis', ...), so the
    # header is written once and the repeats are skipped.
    row_count = 0
    with open(f'restaurants_{date}.csv', 'w') as f:
        writer = csv.writer(f)
        for item in results:
            if row_count != 0 and item[0] != 'camis':
                writer.writerow(item)
            elif row_count == 0 and item[0] == 'camis':
                writer.writerow(item)
            row_count += 1
    return f'restaurants_{date}.csv'
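The same header interleaving shows up again in Code Example #14, which filters the repeated rows out of a DataFrame instead. A generic sketch of the deduplication as a standalone generator (hypothetical helper; assumes every page begins with one identical header row):

def dedupe_csv_pages(rows):
    # Keep the first header row, drop the identical header that begins
    # each subsequent page, and pass all data rows through.
    header = None
    for row in rows:
        if header is None:
            header = row
            yield row
        elif row != header:
            yield row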
Code Example #7
File: etl.py Project: mikss/nypdcd
def fetch_complaints(
    *endpoints: str, how: str = "url", write: bool = False
) -> List[Union[pd.DataFrame, str]]:
    """Retrieve NYPD complaints data and optionally write to disk."""
    latest_path = os.path.join(os.getenv("REPO_ROOT"), "data", str(time.time_ns()))  # assumes REPO_ROOT is set

    complaints = []
    for endpoint in endpoints:
        if how == "api":

            def _reader(data):
                return pd.DataFrame.from_records(data)

            def _writer(data, out_path):
                with open(out_path, "w") as f:
                    json.dump(list(data), f)

            ext = "json"
            client = Socrata("data.cityofnewyork.us", None)
            data = client.get_all(endpoint)

        elif how == "url":

            def _reader(data):
                return pd.read_csv(BytesIO(data.content))

            def _writer(data, out_path):
                with open(out_path, "wb") as f:
                    f.write(data.content)

            ext = "csv"
            data = requests.get(f"https://data.cityofnewyork.us/api/views/{endpoint}/rows.csv")

        else:
            raise NotImplementedError("The keyword argument `how` must be one of 'api' or 'url'.")

        if write:
            Path(latest_path).mkdir(parents=True, exist_ok=True)
            out_path = os.path.join(latest_path, f"{endpoint}.{ext}")
            _writer(data, out_path)
            complaints.append(out_path)

        else:
            complaints.append(_reader(data))

    return complaints
Code Example #8
def clean_data():
    """
    Pulls all ~1.5 million 911 fire calls through the API and returns a
    dataframe trimmed to calls on or after a fixed cutoff (hardcoded below
    as 2019-08-21, i.e. 365 days before this was written).
    """
    client = Socrata("data.seattle.gov", None)

    results = client.get_all("kzjm-xkqj")

    df = pd.DataFrame.from_records(results)
    time = df["datetime"] >= "2019-08-21T00:00:00.000"
    df = df[time]

    cleaned = df.loc[:, ["address", "type", "datetime"]]

    return cleaned
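If the 365-day window should track the current date instead of the fixed string above, a hedged sketch (assumes the dataset keeps the ISO-8601 timestamp format shown in the filter):

    from datetime import datetime, timedelta

    cutoff = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%dT%H:%M:%S.000")
    df = df[df["datetime"] >= cutoff]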
Code Example #9
def fireworks_data_loader():
    '''
    Loads in fireworks data from NYC 311.
    https://data.cityofnewyork.us/Social-Services/311-Fireworks-Complaints/g4u2-tvag
    '''

    client = Socrata('data.cityofnewyork.us', None)
    results = client.get_all('g4u2-tvag')

    df = pd.DataFrame.from_records(results)

    df['created_date'] = pd.to_datetime(df['created_date'], errors='coerce')
    df['fireworks'] = [
        1 if complaint == 'Illegal Fireworks' else 0
        for complaint in df['complaint_type']
    ]

    return df
Code Example #10
def read_data(year, mode='offline'):
    name_map = {"id":'ID',
        "case_number":'Case Number',
        "date":'Date',
        "block":'Block',
        "iucr":'IUCR',
        "primary_type":'Primary Type',
        "description":'Description',
        "location_description":'Location Description',
        "arrest":'Arrest',
        "domestic":'Domestic',
        "beat":'Beat',
        "district":'District',
        "ward":'Ward',
        "community_area":'Community Area',
        "fbi_code":'FBI Code',
        "year":'Year',
        "updated_on":'Updated On',
        "x_coordinate":'X Coordinate',
        "y_coordinate":'Y Coordinate',
        "latitude":'Latitude',
        "longitude":'Longitude',
        "location":'Location'
    }
    if mode == 'offline':
        try:
            csv_cache = st.cache(pd.read_csv)
            results = csv_cache('https://raw.githubusercontent.com/CMU-IDS-2020/a3-05839-a3-fch-ljy/master/subset.csv')
            return results[results.loc[:,'Year']==year]
        except Exception:  # fall back to the live API; for testing only
            st.write('Incomplete data read; for testing only')
            client = Socrata("data.cityofchicago.org", None)
            results = client.get("ijzp-q8t2", where="year={:d}".format(year), limit=100000)
            results = pd.DataFrame.from_records(results)
            results.columns = [name_map[name] for name in list(results.columns)]
            return results[results.loc[:,'Year']==str(year)]
    else:
        client = Socrata("data.cityofchicago.org", None)
        results = client.get_all("ijzp-q8t2", where="year={:d}".format(year))
        results = pd.DataFrame.from_records(results)
        results.columns = [name_map[name] for name in list(results.columns)]
        return results[results.loc[:,'Year']==str(year)]
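A version note on the caching call above: st.cache has since been deprecated in Streamlit, with st.cache_data as the newer replacement. A hedged equivalent (Streamlit 1.18 or later):

    csv_cache = st.cache_data(pd.read_csv)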
Code Example #11
File: data.py Project: kkliu5/covid_app
def create_case_death_df():
    """Case & Death Data ~20s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get_all("9mfq-cb36")
    df_case_death = pd.DataFrame.from_records(results)
    # Changing data types
    df_case_death['submission_date'] = df_case_death['submission_date'].astype(
        'datetime64')
    df_case_death['state'] = df_case_death['state'].astype('string')
    df_case_death['tot_cases'] = df_case_death['tot_cases'].astype('float')
    df_case_death['conf_cases'] = df_case_death['conf_cases'].astype('float')
    df_case_death['prob_cases'] = df_case_death['prob_cases'].astype('float')
    df_case_death['new_case'] = df_case_death['new_case'].astype('float')
    df_case_death['pnew_case'] = df_case_death['pnew_case'].astype('float')
    df_case_death['tot_death'] = df_case_death['tot_death'].astype('float')
    df_case_death['conf_death'] = df_case_death['conf_death'].astype('float')
    df_case_death['prob_death'] = df_case_death['prob_death'].astype('float')
    df_case_death['new_death'] = df_case_death['new_death'].astype('float')
    df_case_death['pnew_death'] = df_case_death['pnew_death'].astype('float')
    df_case_death['created_at'] = df_case_death['created_at'].astype(
        'datetime64')
    # aggregating data based on submission date
    df_case_death_agg = df_case_death.groupby('submission_date',
                                              as_index=False).sum()
    df_case_death_agg['7d_ra_case'] = df_case_death_agg['new_case'].rolling(
        7).mean()
    df_case_death_agg['30d_ra_case'] = df_case_death_agg['new_case'].rolling(
        30).mean()
    df_case_death_agg['7d_ra_death'] = df_case_death_agg['new_death'].rolling(
        7).mean()
    df_case_death_agg['30d_ra_death'] = df_case_death_agg['new_death'].rolling(
        30).mean()
    # sorting data
    df_case_death_agg = df_case_death_agg.sort_values(by=['submission_date'],
                                                      ascending=False,
                                                      ignore_index=True)

    return df_case_death_agg
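A portability note on the two datetime conversions above: recent pandas releases reject a bare astype('datetime64') with no unit. A hedged equivalent using pd.to_datetime:

    df_case_death['submission_date'] = pd.to_datetime(df_case_death['submission_date'])
    df_case_death['created_at'] = pd.to_datetime(df_case_death['created_at'])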
Code Example #12
File: data.py Project: kkliu5/covid_app
def create_hos_df():
    """Case Status ~2s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get(
        "vbim-akqf",
        select=["current_status, count('cdc_case_earliest_dt')"],
        group=["current_status"])
    df_hos_1 = pd.DataFrame.from_records(results)
    df_hos_1['current_status'] = df_hos_1['current_status'].astype('string')
    df_hos_1['count_cdc_case_earliest_dt'] = df_hos_1[
        'count_cdc_case_earliest_dt'].astype('float')
    """Cases by Sex ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get("vbim-akqf",
                         select=["sex, count('cdc_case_earliest_dt')"],
                         group=["sex"])
    df_hos_2 = pd.DataFrame.from_records(results)
    df_hos_2['sex'] = df_hos_2['sex'].astype('string')
    df_hos_2['count_cdc_case_earliest_dt'] = df_hos_2[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_2['sex'] = df_hos_2['sex'].str.replace('Unknown', 'Missing')
    df_hos_2['sex'] = df_hos_2['sex'].str.replace('NA', 'Missing')
    df_hos_2 = df_hos_2.groupby('sex', as_index=False).sum()
    """Cases by Age Group ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get("vbim-akqf",
                         select=["age_group, count('cdc_case_earliest_dt')"],
                         group=["age_group"])
    df_hos_3 = pd.DataFrame.from_records(results)
    df_hos_3['age_group'] = df_hos_3['age_group'].astype('string')
    df_hos_3['count_cdc_case_earliest_dt'] = df_hos_3[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_3['age_group'] = df_hos_3['age_group'].str.replace('NA', 'Missing')
    df_hos_3 = df_hos_3.groupby('age_group', as_index=False).sum()
    """Cases by Race ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get(
        "vbim-akqf",
        select=["race_ethnicity_combined, count('cdc_case_earliest_dt')"],
        group=["race_ethnicity_combined"])
    df_hos_4 = pd.DataFrame.from_records(results)
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].astype('string')
    df_hos_4['count_cdc_case_earliest_dt'] = df_hos_4[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace(
            'American Indian/Alaska Native, Non-Hispanic',
            'American Indian/Alaska Native')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace('Asian, Non-Hispanic', 'Asian')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace('Black, Non-Hispanic', 'Black')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace('Multiple/Other, Non-Hispanic',
                                               'Multiple/Other')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace(
            'Native Hawaiian/Other Pacific Islander, Non-Hispanic',
            'Native Hawaiian/Other Pacific Islander')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace('White, Non-Hispanic', 'White')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace('NA', 'Missing')
    df_hos_4['race_ethnicity_combined'] = df_hos_4[
        'race_ethnicity_combined'].str.replace('Unknown', 'Missing')
    df_hos_4 = df_hos_4.groupby('race_ethnicity_combined',
                                as_index=False).sum()
    """Case by Hospitalzation ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get("vbim-akqf",
                         select=["hosp_yn, count('cdc_case_earliest_dt')"],
                         group=["hosp_yn"])
    df_hos_5 = pd.DataFrame.from_records(results)
    df_hos_5['hosp_yn'] = df_hos_5['hosp_yn'].astype('string')
    df_hos_5['count_cdc_case_earliest_dt'] = df_hos_5[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_5['hosp_yn'] = df_hos_5['hosp_yn'].str.replace('Unknown', 'Missing')
    df_hos_5 = df_hos_5.groupby('hosp_yn', as_index=False).sum()
    """ Cases by ICU ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get("vbim-akqf",
                         select=["icu_yn, count('cdc_case_earliest_dt')"],
                         group=["icu_yn"])
    df_hos_6 = pd.DataFrame.from_records(results)
    df_hos_6['icu_yn'] = df_hos_6['icu_yn'].astype('string')
    df_hos_6['count_cdc_case_earliest_dt'] = df_hos_6[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_6['icu_yn'] = df_hos_6['icu_yn'].str.replace('Unknown', 'Missing')
    df_hos_6['icu_yn'] = df_hos_6['icu_yn'].str.replace('nul', 'Missing')
    df_hos_6 = df_hos_6.groupby('icu_yn', as_index=False).sum()
    """Cases by Death Status ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get("vbim-akqf",
                         select=["death_yn, count('cdc_case_earliest_dt')"],
                         group=["death_yn"])
    df_hos_7 = pd.DataFrame.from_records(results)
    df_hos_7['death_yn'] = df_hos_7['death_yn'].astype('string')
    df_hos_7['count_cdc_case_earliest_dt'] = df_hos_7[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_7['death_yn'] = df_hos_7['death_yn'].str.replace(
        'Unknown', 'Missing')
    df_hos_7 = df_hos_7.groupby('death_yn', as_index=False).sum()
    """Cases with Medical Status ~1s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get("vbim-akqf",
                         select=["medcond_yn, count('cdc_case_earliest_dt')"],
                         group=["medcond_yn"])
    df_hos_8 = pd.DataFrame.from_records(results)
    df_hos_8['medcond_yn'] = df_hos_8['medcond_yn'].astype('string')
    df_hos_8['count_cdc_case_earliest_dt'] = df_hos_8[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_8['medcond_yn'] = df_hos_8['medcond_yn'].str.replace(
        'Unknown', 'Missing')
    df_hos_8 = df_hos_8.groupby('medcond_yn', as_index=False).sum()

    #     """ICU Cases Over Time ~5s"""
    #     client = Socrata("data.cdc.gov", app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc", username="******", password="******")
    #     results = client.get_all("vbim-akqf", select = ["icu_yn, cdc_case_earliest_dt, count('cdc_case_earliest_dt')"], group = ["icu_yn, cdc_case_earliest_dt"])
    #     df_hos_9 = pd.DataFrame.from_records(results)
    #     df_hos_9['icu_yn'] = df_hos_9['icu_yn'].astype('string')
    #     df_hos_9['cdc_case_earliest_dt'] = df_hos_9['cdc_case_earliest_dt'].astype('datetime64')
    #     df_hos_9['count_cdc_case_earliest_dt'] = df_hos_9['count_cdc_case_earliest_dt'].astype('float')
    #     df_hos_9 = df_hos_9[df_hos_9['icu_yn'] == 'Yes']
    """Hospitalized Cases Over Time ~2s"""
    client = Socrata("data.cdc.gov",
                     app_token="SMDNVaBjBRb2aY7ZjRLbnLpZc",
                     username="******",
                     password="******")
    results = client.get_all(
        "vbim-akqf",
        select=[
            "hosp_yn, cdc_case_earliest_dt, count('cdc_case_earliest_dt')"
        ],
        group=["hosp_yn, cdc_case_earliest_dt"])
    df_hos_10 = pd.DataFrame.from_records(results)
    df_hos_10['hosp_yn'] = df_hos_10['hosp_yn'].astype('string')
    df_hos_10['cdc_case_earliest_dt'] = df_hos_10[
        'cdc_case_earliest_dt'].astype('datetime64')
    df_hos_10['count_cdc_case_earliest_dt'] = df_hos_10[
        'count_cdc_case_earliest_dt'].astype('float')
    df_hos_10 = df_hos_10[df_hos_10['hosp_yn'] == 'Yes']

    return df_hos_1, df_hos_2, df_hos_3, df_hos_4, df_hos_5, df_hos_6, df_hos_7, df_hos_8, df_hos_10
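The eight df_hos_* blocks above repeat one pattern: group vbim-akqf by a column, count cdc_case_earliest_dt, coerce the types, normalize a few labels, and re-aggregate. A hypothetical helper folding that pattern into one function (count_by is our name; the sodapy calls are the same as above):

def count_by(client, column, replacements=()):
    # One grouped count over vbim-akqf, typed and label-normalized the same
    # way as each block above.
    results = client.get(
        "vbim-akqf",
        select=[f"{column}, count('cdc_case_earliest_dt')"],
        group=[column])
    df = pd.DataFrame.from_records(results)
    df[column] = df[column].astype('string')
    df['count_cdc_case_earliest_dt'] = (
        df['count_cdc_case_earliest_dt'].astype('float'))
    for old, new in replacements:
        df[column] = df[column].str.replace(old, new)
    return df.groupby(column, as_index=False).sum()

For instance, df_hos_2 would become count_by(client, 'sex', [('Unknown', 'Missing'), ('NA', 'Missing')]).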
Code Example #13
covid_id = "xdss-u53e"

client = Socrata(domain, token)

site_last_updated = 0
if "site-last-updated" in os.listdir("."):
    with open("site-last-updated", "r") as f:
        site_last_updated = int(f.read())

metadata = client.get_metadata(covid_id)
if data_last_updated := metadata.get("rowsUpdatedAt"):
    data_last_updated = int(data_last_updated)
    if site_last_updated >= data_last_updated:
        sys.exit(0)

testing_data = client.get_all(covid_id,
                              select="county, test_date, new_positives")

print("Cleaning data")

testing_data = [{
    "county": c["county"].lower(),
    "test_date": datetime.strptime(c["test_date"], "%Y-%m-%dT%H:%M:%S.%f"),
    "new_positives": int(c["new_positives"]),
} for c in testing_data]

with open("ny-county-populations-2019.csv", "r") as f:
    population_data = [r.split(",") for r in f.read().strip().split("\n")][1:]
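The get_metadata gate at the top of this example is worth isolating: it skips the whole download when the dataset has not changed since the last run. A distilled sketch (needs_refresh is our name; it assumes rowsUpdatedAt is the epoch timestamp Socrata reports for the last row update, as the integer comparison above implies):

def needs_refresh(client, dataset_id, last_seen):
    metadata = client.get_metadata(dataset_id)
    updated_at = int(metadata.get("rowsUpdatedAt", 0))
    return updated_at > last_seen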
Code Example #14
        if m < 10:  # (snippet starts mid-loop; this `if` reconstructed from the parallel m1 check below)
            pad = '0'
        else:
            pad = ''
        # calendar.nextmonth is not a standard-library function; presumably a
        # project-local helper returning the (year, month) following (y, m).
        (y1, m1) = calendar.nextmonth(year=y, month=m)
        if m1 < 10:
            pad1 = '0'
        else:
            pad1 = ''

        s = str(y) + '-' + pad + str(m) + '-01'
        e = str(y1) + '-' + pad1 + str(m1) + '-01'

        d = calendar.monthrange(y, m)[1]

        results = client.get_all("m6dm-c72p",
                                 where="trip_start_timestamp >= \"" + s +
                                 "\" AND trip_start_timestamp < \"" + e + "\"",
                                 content_type="csv")

        results = pd.DataFrame(results)
        results.columns = results.iloc[0]

        # each page of results carries its own header row, so header rows are
        # interleaved in the dataframe; filter them out
        results = results[results['trip_id'] != 'trip_id']

        print("Number of Entries: %d " % (len(results)))
        print("Time Taken Download", datetime.datetime.now() - time)
        time = datetime.datetime.now()

        results.to_csv(save_dir + "chicago_tnc_" + str(y) + pad + str(m) +
                       "01_" + str(y) + pad + str(m) + str(d) + ".csv")
        # (the snippet is truncated here in the source; any further to_csv
        # arguments are lost)
Code Example #15
def Get_Data():
    # connecting to the MongoDB database
    mongo_client = MongoClient()
    # creating the MongoDB database handle
    mydb = mongo_client['health_air_mongo']

    # Instruction from the CDC website to extract data
    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata("chronicdata.cdc.gov", None)

    # Results come back as JSON from the API; get_all yields them as a
    # generator of dictionaries.
    results = client.get_all("us8e-ubyj")

    # Convert to pandas DataFrame
    asthma_df = pd.DataFrame.from_records(results)

    # list containing the questions of interest for our project
    questions_OI = ['Asthma mortality rate',
                    'Emergency department visit rate for asthma',
                    'Hospitalizations for asthma',
                    'Current asthma prevalence among adults aged >= 18 years',
                    'Asthma prevalence among women aged 18-44 years']

    # retrieving the unique ID for each question, to guard against possible spelling variations
    questionids_list=[]
    for question in questions_OI:
        questionids_list += list(asthma_df[asthma_df['question']==question]['questionid'].unique())

    # filtering the dataset for the questions of interest
    filtered_asthma_df = asthma_df[asthma_df['questionid'].isin(questionids_list)]

    # checking whether 'datavalue'/'datavaluealt' and 'yearstart'/'yearend'
    # are duplicate column pairs
    print(f"Are the columns datavalue and datavaluealt the same?\n{filtered_asthma_df['datavalue'].equals(filtered_asthma_df['datavaluealt'])}")
    print(f"Are the columns yearstart and yearend the same?\n{filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend'])}")
    # if yearstart/yearend are duplicates, drop one of them plus other
    # not-useful columns
    if filtered_asthma_df['yearstart'].equals(filtered_asthma_df['yearend']):
        columns_to_drop = ['yearend',
                           'topic',
                           'datavaluealt',
                           'topicid',
                           'datavaluetypeid',
                           'stratificationcategoryid1',
                           'stratificationid1',
                           'lowconfidencelimit',
                           'highconfidencelimit',
                           'datavaluefootnotesymbol',
                           'datavaluefootnote']
        filtered_asthma_df=filtered_asthma_df.drop(columns=columns_to_drop)
        print(f"The following columns {columns_to_drop} were dropped")

    # renaming some columns
    filtered_asthma_df=filtered_asthma_df.rename(columns={"yearstart": "year","locationabbr":"state_id","locationdesc":"state"})

    # removing rows with missing data values
    filtered_asthma_df = filtered_asthma_df[filtered_asthma_df['datavalue'].notna()]

    # drop territories (PR, GU, VI) and nation-wide data (US)
    state_to_drop = ['PR','GU','US','VI']
    filtered_asthma_df = filtered_asthma_df[~filtered_asthma_df['state_id'].isin(state_to_drop)]

    # dropping duplicate rows
    filtered_asthma_df = filtered_asthma_df.drop_duplicates()

    # keeping only the 'Overall' stratification values
    filtered_asthma_df = filtered_asthma_df[filtered_asthma_df['stratificationcategory1'] == 'Overall']

    # final dropping of unnecessary columns
    filtered_asthma_df = filtered_asthma_df.drop(columns=['state',
                                                          'datasource',
                                                          'question',
                                                          'stratificationcategory1',
                                                          'stratification1',
                                                          'locationid',
                                                          'questionid'])


    # adding collection about asthma to MongoDB
    add_collection_to_mongo('asthma',filtered_asthma_df)

    # finding the names of all the files in the air-quality folder
    all_files = glob.glob('./Air_quality_csv/*.csv')

    air_list = []
    # reading all the CSV files
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        air_list.append(df)
    # creating the concatenated dataframe
    air_df = pd.concat(air_list, axis=0, ignore_index=True)

    # the data needs to be aggregated by state and year. The output is the
    # average number of days over the legal limits per parameter of interest
    # (i.e. PMx); the average median and maximum AQI per state and year is
    # computed as well.
    aggregate_air_df = air_df.groupby(by=['State', 'Year']).mean()
    air_df = aggregate_air_df.reset_index()

    # dropping not-useful columns
    columns_to_drop = ['Days with AQI',
                       'Good Days',
                       'Moderate Days',
                       'Unhealthy for Sensitive Groups Days',
                       'Unhealthy Days',
                       'Very Unhealthy Days',
                       'Hazardous Days',
                       '90th Percentile AQI',
                       'Days CO',
                       'Days NO2',
                       'Days SO2']
    air_df = air_df.drop(columns=columns_to_drop)

    # converting the day counts to integers, since fractional day counts are
    # not meaningful
    for column in air_df.columns:
        if air_df[column].dtype == float:
            air_df[column] = air_df[column].astype('int64')

    # removing the dot from the key name, since dots in field names cause
    # issues in MongoDB
    air_df = air_df.rename(columns={"Days PM2.5": "Days PM2_5"})

    # adding the air-quality collection to MongoDB
    add_collection_to_mongo('air', air_df)

    # closing the connection to MongoDB
    mongo_client.close()
    print('Data successfully imported!')