Example #1
def get_nps_work_groups(connection_txt):

    engine = query.connect_db(connection_txt)

    with engine.connect() as conn, conn.begin():
        work_groups = pd.read_sql("SELECT DISTINCT work_group FROM nps_vehicles;", conn)\
            .squeeze() # Should only return one column so make it a Series

    return work_groups[~work_groups.isnull()].sort_values().tolist()
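
# A minimal usage sketch (not from the original source): assuming 'connection_info.txt' holds
# the Postgres credentials that query.connect_db() expects, this returns a sorted list of
# distinct, non-null work groups.
if __name__ == '__main__':
    work_groups = get_nps_work_groups('connection_info.txt')
    print(work_groups)  # e.g. ['Interpretation', 'Maintenance', ...]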
Example #2
def main(data_dir,
         sqlite_path,
         connection_txt,
         archive_dir="",
         data_files=None):

    sys.stdout.write("Log file for %s\n%s\n\n" %
                     (__file__, datetime.now().strftime('%H:%M:%S %m/%d/%Y')))
    sys.stdout.write('Command: python %s\n\n' %
                     subprocess.list2cmdline(sys.argv))
    sys.stdout.flush()

    postgres_engine = connect_db(connection_txt)
    sqlite_engine = create_engine("sqlite:///" + sqlite_path)

    # Check if this date already exists in the shift_info table.
    #  Wrap the whole import block in the with .connect() context manager to run everything as a transaction that is
    #  automatically rolled back on error
    with postgres_engine.connect() as pg_conn, pg_conn.begin():
        pg_shift_info = pd.read_sql_table('shift_info',
                                          pg_conn,
                                          index_col='id')
        with sqlite_engine.connect() as sl_conn, sl_conn.begin():
            shift_info_data = pd.read_sql("SELECT * FROM sessions",
                                          sl_conn)  #.squeeze()

        for _, sl_shift_info in shift_info_data.iterrows():
            pg_shift_info['date_str'] = pg_shift_info.open_time.dt.strftime(
                '%Y%m%d')
            sl_open_time = pd.to_datetime('%(date)s %(open_time)s' %
                                          sl_shift_info)
            sl_close_time = pd.to_datetime('%(date)s %(close_time)s' %
                                           sl_shift_info)

            # If it exists, replace the open and close times with the earliest and latest, respectively
            if (pg_shift_info.date_str == sl_open_time.strftime('%Y%m%d')
                ).any():
                id = pg_shift_info.loc[pg_shift_info.date_str == sl_open_time.
                                       strftime('%Y%m%d')].iloc[0].name
                pg_open_time = pg_shift_info.loc[id, 'open_time']
                pg_close_time = pg_shift_info.loc[id, 'close_time']
                open_time = min(pg_open_time, sl_open_time)
                close_time = max(pg_close_time, sl_close_time)
                sql = "UPDATE shift_info SET open_time = '%s', close_time = '%s' WHERE id=%s;" % (
                    open_time, close_time, id)
            else:
                sql = "INSERT INTO shift_info (open_time, close_time, shift_date) VALUES ('%s', '%s', '%s')" % \
                      (sl_open_time, sl_close_time, sl_open_time.strftime('%Y-%m-%d'))

            # Run the INSERT/UPDATE for this shift before moving on to the next one
            pg_conn.execute(sql)

        sys.stdout.write('Successfully imported from:')

        for csv_path in glob(os.path.join(data_dir, '*_checked.csv')):
            table_name = os.path.basename(csv_path).replace('_checked.csv', '')

            # Because Access can't handle datetimes in the default SQL format, they had to be converted to
            #   something Access could handle. So now, make it a datetime again
            df = pd.read_csv(csv_path, parse_dates=['datetime'])

            # get sqlite dtypes and convert data back as necessary since Access annoyingly converts bools to
            #   integers
            with sqlite_engine.connect() as conn, conn.begin():
                sqlite_data = pd.read_sql_table(table_name, conn)

            df = clean_app_data(df, sqlite_data, table_name, postgres_engine)

            if len(df):
                #with postgres_engine.connect() as pg_conn, pg_conn.begin():
                postgres_columns = pd.read_sql("SELECT column_name FROM information_schema.columns "
                                               "WHERE table_name = '{}' AND table_schema = 'public';"
                                               .format(table_name), pg_conn) \
                    .squeeze()\
                    .tolist()

                df.drop([c for c in df if c not in postgres_columns],
                        axis=1,
                        inplace=True)

                df.to_sql(table_name, pg_conn, if_exists='append', index=False)
                sys.stdout.write('\n\t-%s' % table_name)

    # Update the imported column
    try:
        # Loop through each file and set its 'imported' field. If no file list was given, fall back to just
        #   the sqlite path (data_files is a semicolon-delimited string of paths, so keep it as a string)
        if not data_files:
            data_files = sqlite_path

        for db_path in data_files.split(';'):
            sqlite_engine = create_engine("sqlite:///" + db_path)
            with sqlite_engine.connect() as conn, conn.begin():
                session_columns = pd.read_sql_table('sessions', conn).columns
            if 'imported' not in session_columns:
                sqlite_engine.execute(
                    "ALTER TABLE sessions ADD COLUMN imported INTEGER;")
            sqlite_engine.execute("UPDATE sessions SET imported = 1;")

    except:
        warnings.warn(
            "Failed to update 'imported' field in the data from the app. If you try to run this script again,"
            "it will not warn you that these data have already been uploaded.")

    # Copy the sqlite db to the archive
    if not os.path.isdir(archive_dir):
        try:
            os.mkdir(archive_dir)
            #shutil.copy(sqlite_path, archive_dir)
        except:
            pass
    if data_files:
        for path in data_files.split(';'):
            shutil.copy(path, archive_dir)
    else:
        shutil.copy(sqlite_path, archive_dir)

    # Clean up the text files and temporary dir created by validate_app_data.py
    try:
        shutil.rmtree(data_dir)
    except:
        pass

    # Try to delete the "combined_data.db"
    if os.path.basename(sqlite_path) == 'combined_data.db':
        try:
            os.remove(sqlite_path)
        except:
            pass
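
# Hypothetical invocation sketch (paths are placeholders, not from the original source): imports
# the *_checked.csv files produced by the validation step into Postgres, then archives the
# sqlite database(s) exported from the app.
if __name__ == '__main__':
    main(data_dir=r'C:\savage\_temp',
         sqlite_path=r'C:\savage\combined_data.db',
         connection_txt='connection_info.txt',
         archive_dir=r'C:\savage\archive',
         data_files=r'C:\savage\shift1.db;C:\savage\shift2.db')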
Example #3
def main(connection_txt, years=None, out_dir=None, out_csv=None):

    if not (out_dir or out_csv):
        raise ValueError('Either a valid out_dir or out_csv must be given')

    # If none given, just use the current year
    if not years:
        years = [datetime.now().year]

    # If passed from the command line, it will be in the form 'year1, year2'
    elif ',' in years:
        years = [int(y.strip()) for y in years.split(',')]
    # or year_start-year_end
    elif '-' in years:
        year_start, year_end = [int(y.strip()) for y in years.split('-')]
        years = range(year_start, year_end + 1)
    elif len(years) == 4:
        years = [int(years)]
    else:
        raise ValueError(
            'years must be in the form "YYYY", "year1, year2, ...", or "year_start-year_end". Years given were %s'
            % years)

    if not out_csv:
        out_basename = 'gmp_vehicle_count_%s_%s.csv' % (
            years[0], years[-1]) if len(
                years) > 1 else 'gmp_vehicle_count_%s.csv' % years[0]
        out_csv = os.path.join(out_dir, out_basename)

    # Try to open the file to make sure it's not already open and therefore locked
    try:
        f = open(out_csv, 'w')
        f.close()
    except IOError as e:
        if e.errno == os.errno.EACCES:
            raise IOError(
                'Permission to access the output file %s was denied. This is likely because the file is currently open. Please close the file and re-run the script.'
                % out_csv)
        # Not a permission error.
        raise

    # read connection params from text. Need to keep them in a text file because password can't be stored in Github repo
    engine = query.connect_db(connection_txt)

    # Get field names that don't contain unique IDs
    field_names = query.query_field_names(engine)

    # Initiate the log file
    sys.stdout.write("Log file for %s: %s\n" %
                     (__file__, datetime.now().strftime('%H:%M:%S %m/%d/%Y')))
    sys.stdout.write('Command: python %s\n\n' %
                     subprocess.list2cmdline(sys.argv))
    sys.stdout.flush()

    yearly_data = []
    sql_statements = []
    for year in years:
        start_date = '%s-05-20 00:00:00' % year
        end_date = '%s-09-16 00:00:00' % year

        gmp_starts, gmp_ends = cvbt.get_gmp_dates(datetime(year, 5, 1),
                                                  datetime(year, 9, 16))
        btw_stmts = []
        for gmp_start, gmp_end in zip(gmp_starts, gmp_ends):
            btw_stmts.append(
                "(datetime::date BETWEEN '{start}' AND '{end}') ".format(
                    start=gmp_start.strftime('%Y-%m-%d'),
                    end=gmp_end.strftime('%Y-%m-%d')))
        gmp_date_clause = ' AND (%s) ' % ('OR '.join(btw_stmts))

        #gmp_date_clause, _, _ = cvbt.get_gmp_date_clause(datetime(year, 5, 1), datetime(year, 9, 16))
        date_range = cvbt.get_date_range(start_date,
                                         end_date,
                                         summarize_by='month')
        output_fields = cvbt.get_output_field_names(date_range, 'month')

        # Query buses
        bus_names = {
            'Transit': ['SHU', 'CMP', 'OTH', 'NUL', 'CHT', 'SPR', 'RSC', 'UNK'],
            'Long tour': ['KXP', 'EXC', 'TWT', 'WIW'],
            'Short tour': ['DNH'],
            'Educational buses': ['EDU']
        }
        other_criteria = "(is_training = ''false'') " + gmp_date_clause.replace(
            "'", "''")
        # All non-training buses except DNHTs. Do this separately so I can exclude buses going to Primrose
        bus_vehicles, sql = query.crosstab_query(
            engine,
            'buses',
            start_date,
            end_date,
            'bus_type',
            other_criteria=other_criteria + " AND destination <> ''PRM''",
            dissolve_names=bus_names,
            field_names=field_names['buses'],
            summarize_by='month',
            output_fields=output_fields,
            filter_fields=True,
            return_sql=True)
        sql_statements.append(sql)
        # Just non-training DNHTs
        with engine.connect() as conn, conn.begin():
            dissolve_names = {
                'Primrose buses':
                pd.read_sql("SELECT code FROM bus_codes",
                            conn).squeeze().tolist()
            }

        primrose_buses, sql = query.crosstab_query(
            engine,
            'buses',
            start_date,
            end_date,
            'bus_type',
            other_criteria=other_criteria + " AND destination = ''PRM''",
            dissolve_names=dissolve_names,
            field_names=field_names['buses'],
            summarize_by='month',
            output_fields=output_fields,
            filter_fields=True,
            return_sql=True)
        sql_statements.append(sql)
        bus_vehicles = bus_vehicles.append(primrose_buses)
        dissolve_names['Primrose buses pax'] = dissolve_names.pop(
            'Primrose buses')
        primrose_passengers, sql = query.crosstab_query(
            engine,
            'buses',
            start_date,
            end_date,
            'bus_type',
            other_criteria=other_criteria + " AND destination = ''PRM''",
            dissolve_names=dissolve_names,
            field_names=field_names['buses'],
            summarize_by='month',
            output_fields=output_fields,
            filter_fields=True,
            summary_field='n_passengers',
            return_sql=True)
        sql_statements.append(sql)
        bus_vehicles = bus_vehicles.append(primrose_passengers)

        # Rename lodge bus codes to use actual name
        bus_codes = query.get_lookup_table(engine, 'bus_codes')
        #del bus_codes['NUL'] # don't count buses without a type
        bus_vehicles.rename(index=bus_codes, inplace=True)

        # Query bus passengers
        bus_passengers, sql = query.crosstab_query(
            engine,
            'buses',
            start_date,
            end_date,
            'bus_type',
            other_criteria=other_criteria + " AND destination <> ''PRM''",
            dissolve_names=bus_names,
            field_names=field_names['buses'],
            summarize_by='month',
            output_fields=output_fields,
            filter_fields=True,
            summary_stat='SUM',
            summary_field='n_passengers',
            return_sql=True)
        sql_statements.append(sql)
        # Again, rename lodge bus codes to use actual names
        bus_passengers.rename(index=bus_codes, inplace=True)
        bus_passengers.index = [ind + ' pax' for ind in bus_passengers.index]

        # Query training buses
        trn_names = {
            'JV training':
            [item for k, v in bus_names.iteritems() for item in v] + ['TRN'],
            'Lodge bus training': ['CDN', 'KRH', 'DBL']
        }
        other_criteria = "(is_training OR bus_type=''TRN'') " + gmp_date_clause.replace(
            "'", "''")
        trn_buses, sql = query.crosstab_query(engine,
                                              'buses',
                                              start_date,
                                              end_date,
                                              'bus_type',
                                              other_criteria=other_criteria,
                                              dissolve_names=trn_names,
                                              field_names=field_names['buses'],
                                              summarize_by='month',
                                              output_fields=output_fields,
                                              return_sql=True)
        sql_statements.append(sql)

        # Query nps_approved
        #primrose_stmt = " AND destination <> 'PRM' "
        other_criteria = (gmp_date_clause).replace("'", "''")
        approved_vehicles, sql = query.crosstab_query(
            engine,
            'nps_approved',
            start_date,
            end_date,
            'approved_type',
            other_criteria=other_criteria,
            field_names=field_names['nps_approved'],
            summarize_by='month',
            output_fields=output_fields,
            return_sql=True,
            dissolve_names={'Other': ['OTH', 'NUL']})
        approved_codes = query.get_lookup_table(engine, 'nps_approved_codes')
        approved_vehicles.rename(index=approved_codes, inplace=True)
        sql_statements.append(sql)
        '''# Get concessionaire (i.e., JV) trips to Primrose separately because it's not included in the GMP count
        #other_criteria = "destination = ''PRM'' AND approved_type = ''CON'' "
        approved_vehicles_primrose, sql = query.crosstab_query(engine, 'nps_approved', start_date, end_date, 'approved_type',
                                                               other_criteria=gmp_date_clause.replace("'", "''"),
                                                               field_names=field_names['nps_approved'], summarize_by='month',
                                                               output_fields=output_fields, return_sql=True)
        sql_statements.append(sql)
        if len(approved_vehicles_primrose) > 0:
            import pdb; pdb.set_trace()
            approved_vehicles_primrose.index = ['JV (Primrose)']'''

        # Rename Nulls to other.
        approved_vehicles.rename(index={'Null': 'Other'}, inplace=True)
        approved_vehicles = approved_vehicles.groupby(
            by=approved_vehicles.index).sum()  # consolidate the two 'Other' rows

        # Query all other vehicle types with a regular GROUP BY query
        simple_counts = []
        other_criteria = (gmp_date_clause).lstrip('AND ')
        for table_name in SIMPLE_COUNT_QUERIES:
            counts, sql = query.simple_query(
                engine,
                table_name,
                field_names=field_names[table_name],
                other_criteria=other_criteria,
                summarize_by='month',
                output_fields=output_fields,
                return_sql=True)
            simple_counts.append(counts)
            sql_statements.append(sql)
        simple_counts = pd.concat(simple_counts, sort=False)

        # Get tek and accessibility passengers and number of cyclists
        accessibility_passengers, sql = query.simple_query(
            engine,
            'accessibility',
            field_names=field_names['accessibility'],
            other_criteria=other_criteria,
            summarize_by='month',
            output_fields=output_fields,
            summary_field='n_passengers',
            summary_stat='SUM',
            return_sql=True)
        sql_statements.append(sql)
        if len(accessibility_passengers) > 0:
            accessibility_passengers.index = [
                PRINT_NAMES['accessibility'] + ' pax'
            ]

        tek_passengers, sql = query.simple_query(
            engine,
            'tek_campers',
            field_names=field_names['tek_campers'],
            other_criteria=other_criteria,
            summarize_by='month',
            output_fields=output_fields,
            summary_field='n_passengers',
            summary_stat='SUM',
            return_sql=True)
        if len(tek_passengers) > 0:
            tek_passengers.index = [PRINT_NAMES['tek_campers'] + ' pax']
        sql_statements.append(sql)

        cyclists, sql = query.simple_query(engine,
                                           'cyclists',
                                           field_names=field_names['cyclists'],
                                           other_criteria=other_criteria,
                                           summarize_by='month',
                                           output_fields=output_fields,
                                           summary_field='n_passengers',
                                           summary_stat='SUM',
                                           return_sql=True)
        sql_statements.append(sql)
        if len(cyclists) > 0:
            cyclists.index = [PRINT_NAMES['cyclists']]

        all_data = pd.concat(
            [
                bus_vehicles,
                bus_passengers,
                trn_buses,
                approved_vehicles,
                #approved_vehicles_primrose,
                simple_counts,
                accessibility_passengers,
                tek_passengers,
                cyclists
            ],
            sort=False)

        all_data.columns = [
            datetime.strftime(datetime.strptime(c, '_%Y_%m'), '%b')
            if c != 'total' else c for c in all_data.columns
        ]

        # Make sure all rows have print-worthy names and set the order of rows and cols
        def replace(x, d):
            return d[x] if x in d else x

        all_data.index = all_data.index.map(lambda x: replace(x, PRINT_NAMES))

        if 'NUL' in all_data.index.tolist(): all_data.drop('NUL', inplace=True)
        all_data = all_data.reindex(
            index=SORT_ORDER,
            columns=['May', 'Jun', 'Jul', 'Aug', 'Sep', 'total']).fillna(0)

        # Set a multiindex for GMP stats (rows)
        gmp_rows = all_data.index[:-6]
        all_data.index = [['GMP'] * len(gmp_rows) + ['Non-GMP'] *
                          (len(all_data) - len(gmp_rows)), all_data.index]

        # Calculate totals
        pax_inds = [
            ind for ind in all_data.loc['GMP'].index.get_level_values(0)
            if 'pax' in ind
        ]
        vehicle_inds = [
            ind for ind in all_data.loc['GMP'].index.get_level_values(0)
            if 'pax' not in ind
        ]
        all_data.loc[('Totals', 'GMP vehicles'), :] = all_data.loc[(
            'GMP', vehicle_inds), :].sum(axis=0)
        all_data.loc[('Totals',
                      'GMP pax'), :] = all_data.loc[('GMP',
                                                     pax_inds), :].sum(axis=0)
        all_data.columns = [[year] * len(all_data.columns), all_data.columns]
        yearly_data.append(all_data)

    # Combine all years into one df and calculate % change if more than one year was given
    all_data = pd.concat(yearly_data, axis=1, sort=False)
    last_year = years[-1]
    for year in years[:-1]:
        all_data.loc[:, ('total_pct_change', 'from_%s' % year)] = \
            ((all_data.loc[:, (last_year, 'total')] - all_data.loc[:, (year, 'total')]) /
             all_data.loc[:, (year, 'total')] * 100) \
                .replace([np.inf, -np.inf], np.nan)\
                .fillna(0)\
                .round(1)
    #all_data = all_data.fillna(0)

    all_data.to_csv(out_csv)

    out_sql_txt = out_csv.replace('.csv', '_sql.txt')
    break_str = '#' * 100
    with open(out_sql_txt, 'w') as f:
        for stmt in sql_statements:
            f.write(stmt + '\n\n%s\n\n' % break_str)
        f.write('\n\n\n')

    # Write metadata
    descr = "This text file {out_csv} summarizes data from the Savage Check Station database by month and vehicle type for {start_year}-{end_year}. Vehicles that count toward the General Management Plan 10,512 vehicle limit (labeled 'GMP' in the first column of the text file) are tallied separately from those that do not. Veehicle lables are defined using the following types of vehicles (all labels ending with 'pax' represent a count of passengers for the corresponding vehicle label): \n"
    descr += "\n\t- Long tour: All Tundra Wilderness Tour, Kantishna Experience, and Eielson Excursion buses that were not training" \
             "\n\t- Short tour: All Denali Natural History Tours going further west than Primrose that were not training" \
             "\n\t- Transit: All Transit (formely 'Shuttle'), Camper, Other, Commerical charter, Spare, Rescue, Unknown, and Null bus types that were not training" \
             "\n\t- JV Training: All concessionaire buses marked as training" \
             "\n\t- CD-NF buses: All Camp Denali/North Face buses that were not training" \
             "\n\t- DBL buses: All Denali Backcountry Lodge buses that were not training" \
             "\n\t- KRH buses: All Kantishna Roadhouse buses that are were training" \
             "\n\t- Lodge bus training: All lodge buses marked as trianing" \
             "\n\t- Inholders: All private vehicles using a Right-of-Way special use road permit" \
             "\n\t- Subsistence: All private vehicles using a Subsistence special user road permit" \
             "\n\t- Tek campers: All private vehicles camping at the Teklanika Campground" \
             "\n\t- Prophos: All private vehicles using a professional photographer and commerical filming special use road permit" \
             "\n\t- Accessibility: All private vehicles using an accessibility special use road permit" \
             "\n\t- Researchers: All private vehicles using an 'NPS Approved' special use road permit for conducting research" \
             "\n\t- Education: All private vehicles using an 'NPS Approved' special use road permit for education purposes (typically the Murie Science and Learning Center)" \
             "\n\t- JV: All private vehicles using an 'NPS Approved' special use road permit assigned to the concessionaire" \
             "\n\t- NPS: All government vehicles" \
             "\n\t- Primrose buses: All JV buses going only to Primrose Rest Area that were not training" \
             "\n\t- Contractors: All private vehicles using a 'Contractor' special use road permit" \
             "\n\t- Cyclists: All visitors riding bicycles"

    descr += "\n\nOther files created:" \
             "\n\t- {out_sql_txt}: All Postgres SQL commands submitted to query the data"
    descr = descr.format(out_csv=os.path.basename(out_csv),
                         start_year=years[0],
                         end_year=years[-1],
                         out_sql_txt=os.path.basename(out_sql_txt))

    command = 'python ' + subprocess.list2cmdline(sys.argv)
    datestamp = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
    msg = descr + \
          "\n\nFor questions, please contact Sam Hooper at [email protected]\n" \
          "\nSCRIPT: {script}" \
          "\nTIME PROCESSED: {datestamp}" \
          "\nCOMMAND: {command}"\
              .format(script=__file__,
                      datestamp=datestamp,
                      command=command)

    readme_path = out_csv.replace('.csv', '_README.txt')
    with open(readme_path, 'w') as readme:
        readme.write(msg)

    print '\nOutput file written to: %s' % out_csv
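
# Hypothetical invocation sketch (paths are placeholders, not from the original source), showing
# the three accepted forms of the years argument: a single 'YYYY', a comma-separated list, or a
# 'year_start-year_end' range.
if __name__ == '__main__':
    main('connection_info.txt', years='2018, 2019', out_dir=r'C:\savage\exports')
    # main('connection_info.txt', years='2019', out_csv=r'C:\savage\exports\gmp_2019.csv')
    # main('connection_info.txt', years='2017-2019', out_dir=r'C:\savage\exports')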
Example #4
def get_normalized_daily_mean(query_end_datetime, connection_txt):
    ''' Return a dataframe of daily totals for query year and a series of mean daily totals for the last 5 years normalized to 10,512 (sum of daily totals == 10512)'''

    # We want to compare to the last 5 years of data so go 6 years back. The first 5 are for comparison and the most
    #  recent is the year of interest
    query_year = query_end_datetime.year
    exclude_years = EXCLUDE_ESTIMATION_YEARS[
        (EXCLUDE_ESTIMATION_YEARS >= (query_year - 6)) &
        (EXCLUDE_ESTIMATION_YEARS < query_year)
    ]

    # Make sure at least 5 years of old data are used
    start_datetime = datetime(query_year - (6 + len(exclude_years)), 5, 15)

    # Just get all the data at first, then filter out this year's data. That way we only have to query the DB once
    end_datetime = datetime(query_year, 9, 30)
    # If querying for a particular day in the season, make end_datetime the earlier of either the given date or Sep 30
    if query_end_datetime < end_datetime:
        end_datetime = query_end_datetime

    # Get start and end dates for each season
    gmp_starts, gmp_ends = count.get_gmp_dates(start_datetime, end_datetime)
    btw_stmts = []
    for gmp_start, gmp_end in zip(gmp_starts, gmp_ends):
        # Skip any years in the EXCLUDE series
        gmp_year = gmp_start.year
        if gmp_year in exclude_years.values and gmp_year != query_year:
            continue
        this_end_date = gmp_end if gmp_end < end_datetime else end_datetime
        btw_stmts.append("(datetime::date BETWEEN '{start}' AND '{end}') "
                         .format(start=gmp_start.strftime('%Y-%m-%d'),
                                 end=this_end_date.strftime('%Y-%m-%d'))
                         )
    gmp_date_criteria = ' AND (%s) ' % ('OR '.join(btw_stmts))
    start_datetime = max(start_datetime, gmp_starts.min().to_pydatetime())
    end_datetime = min(end_datetime, gmp_ends.max().to_pydatetime())
    start_date = start_datetime.strftime(count.DATETIME_FORMAT)
    end_date = end_datetime.strftime(count.DATETIME_FORMAT)

    # Get date range and names of output fields (formatted dates)
    date_range = count.get_date_range(start_date, end_date, summarize_by=SUMMARIZE_BY)
    date_range = date_range[~date_range.year.isin(exclude_years)]
    output_fields = count.get_output_field_names(date_range, SUMMARIZE_BY, gmp_dates=[gmp_starts, gmp_ends])

    engine = query.connect_db(connection_txt)

    x_labels = count.get_x_labels(pd.to_datetime(output_fields.index), SUMMARIZE_BY)
    x_labels.index = output_fields
    field_names = query.query_field_names(engine)

    # Query database to get a count of all GMP vehicles by day
    data, _ = count.query_total(output_fields, field_names, output_fields.index[0], end_date, date_range, SUMMARIZE_BY, engine, other_criteria=gmp_date_criteria, use_gmp_vehicles=True)

    # data is returned as a 1-row df where each column is a different day, so make it a series
    data = data.squeeze()

    # Make it a dataframe again with a column for date
    data = pd.DataFrame({'datetime': pd.to_datetime(data.index, format=count.FORMAT_STRS[SUMMARIZE_BY]), 'daily_total': data})

    # For each year, record the day of the season. This will be used to align the days of different years. The reason
    #   for using day of season instead of day of year is because there is a weekly pattern to vehicle counts and the
    #   start day of the GMP regulatory period is always a Saturday. Using the day of the season will always align days
    #   by day of the week
    dfs = []
    for year, df in data.groupby(data.datetime.dt.year):
        df = df.sort_index()
        min_datetime = df.datetime.min()
        df['day_of_season'] = (df.datetime - min_datetime).dt.days
        df['year'] = year

        # Split the data into the first 5 years and last year. Calculate the daily total normalized by
        #   the 10512/(total for this year). This will make each year add up to 10512, so that when we take the average
        #   by day of season, we get some typical pattern of daily values if every year met the 10512 value exactly
        if year < query_year:
            df['normalized_total'] = df.daily_total * GMP_LIMIT/df.daily_total.sum()
            dfs.append(df)
        else:
            current_data = df.set_index('day_of_season')

    previous_data = pd.concat(dfs)
    grouped = previous_data.groupby('day_of_season').normalized_total
    normalized_data = pd.DataFrame({'nmean': grouped.mean(), 'nmax': grouped.max(), 'nmin': grouped.min()})

    return current_data, normalized_data
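
# Standalone sketch of the normalization step above using made-up daily totals instead of a live
# database query (the toy numbers are illustrative only): each prior season is rescaled so its
# daily totals sum to the 10,512 GMP limit, then the rescaled values are averaged by day of season.
import pandas as pd

GMP_LIMIT = 10512
fake = pd.DataFrame({'year': [2017] * 3 + [2018] * 3,
                     'day_of_season': [0, 1, 2, 0, 1, 2],
                     'daily_total': [80, 95, 90, 70, 100, 85]})
fake['normalized_total'] = fake.groupby('year').daily_total\
    .transform(lambda x: x * GMP_LIMIT / x.sum())
print(fake.groupby('day_of_season').normalized_total.mean())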
Example #5
def main(sqlite_paths_str, connection_txt, output_dir=None):

    sys.stdout.write("Log file for %s: %s\n" %
                     (__file__, datetime.now().strftime('%H:%M:%S %m/%d/%Y')))
    sys.stdout.write('Command: python %s\n\n' %
                     subprocess.list2cmdline(sys.argv))
    sys.stdout.flush()

    postgres_engine = connect_db(connection_txt)
    sqlite_path, component_paths = combine_sqlite_dbs(sqlite_paths_str,
                                                      postgres_engine)

    if not len(component_paths):
        raise RuntimeError('All data files are empty')

    sys.stdout.write('sqlite_paths: %s\n\n' % component_paths)
    sys.stdout.flush()

    sqlite_engine = create_engine("sqlite:///" + sqlite_path)
    # Get list of all tables in the master DB
    with postgres_engine.connect() as pg_conn, pg_conn.begin():
        postgres_tables = pd.read_sql("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';",
                                      pg_conn)\
                                      .squeeze()\
                                      .tolist()
        pg_shift_info = pd.read_sql_table('shift_info',
                                          pg_conn,
                                          index_col='id')

    # Get data from app
    with sqlite_engine.connect() as sl_conn, sl_conn.begin():
        sqlite_tables = pd.read_sql("SELECT name FROM sqlite_master WHERE name NOT LIKE('sqlite%') AND name NOT "
                                    "LIKE('sessions');",
                                    sl_conn)\
                                    .squeeze()
        data = {
            table_name: pd.read_sql("SELECT * FROM %s" % table_name,
                                    sl_conn,
                                    index_col='id')
            for table_name in sqlite_tables
        }
        sl_shift_info = pd.read_sql("SELECT * FROM sessions",
                                    sl_conn)  #.squeeze()

    if 'imported' in sl_shift_info.columns:  #index:
        for _, this_shift_info in sl_shift_info.loc[~sl_shift_info.imported.
                                                    isnull()].iterrows():
            if this_shift_info.imported:
                raise RuntimeError(
                    "These data have already been uploaded: %s" %
                    this_shift_info.filename)

    # Make temp dir and set up vars for looping through tables
    output_dir = os.path.join(os.path.dirname(sqlite_path),
                              '_temp') if not output_dir else output_dir
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    subprocess.call(["attrib", "+H", output_dir])

    destination_values = get_lookup_table(postgres_engine,
                                          'destination_codes').values

    missing_lookup_dfs = []
    for table_name, df in data.iteritems():

        if table_name == 'observations':
            continue

        # Check that the table exists in the master DB. If not, skip it.
        if table_name not in postgres_tables:
            warnings.warn(
                'Table named "{table_name}" not found in database. Database tables are: \n\t\t{pg_tables}'
                .format(table_name=table_name,
                        pg_tables='\n\t\t'.join(postgres_tables)))
            continue

        # If there's no data in this table, write the empty dataframe and continue
        flagged_path = os.path.join(output_dir, '%s_flagged.csv' % table_name)
        if not len(df):
            df.to_csv(flagged_path, index=False, encoding='utf-8')
            continue

        # Clean up unicode strings so sqlalchemy doesn't freak out when importing
        df.loc[:, df.dtypes == object] = df.loc[:, df.dtypes == object]\
            .applymap(lambda x: x if x == None else str(unicodedata.normalize('NFKD', x)))

        # Combine date and time columns
        df['datetime'] = pd.to_datetime(
            df.date + ' ' +
            df.time)  # format should be automatically understood
        df.drop(['date', 'time'], axis=1, inplace=True)

        # Check for duplicates within the DB from the app
        duplicate_columns = [
            c for c in DUPLICATE_FIELDS_ALL + DUPLICATE_FIELDS_TBL[table_name]
            if c in df.columns
        ]
        sl_duplicates = df.loc[df.duplicated(
            subset=duplicate_columns,
            keep=False)].copy()  # keep=false keeps all dups
        sl_duplicates['duplicated_in_app'] = True

        # Check for duplicates with the Postgres db. Limit the check to only Postgres records from this year to
        #   reduce read times
        numeric_fields = pd.Series([
            f for f in get_numeric_pg_fields(postgres_engine, table_name)
            if f in df.columns
        ])
        if hasattr(numeric_fields, '__iter__'):
            numeric_fields = numeric_fields[numeric_fields.isin(
                duplicate_columns)]  # & numeric_fields.isin(df.columns)]
        with postgres_engine.connect() as pg_conn, pg_conn.begin():
            pg_data = pd.read_sql(
                "SELECT * FROM {table_name} WHERE extract(year FROM datetime) = {year}"
                .format(table_name=table_name,
                        year=datetime.now().year), pg_conn)

        # If there are no data in the DB for this table, the pd.merge() line will balk because the column dtypes to
        #   merge on won't match. So check, and create an empty dataframe if true
        if len(pg_data):
            cleaned_data = clean_app_data(df, df, table_name, postgres_engine)

            # Get all indices from all rows in df whose duplicate columns match those in the master DB.
            cleaned_data['id_'] = cleaned_data.index
            merged = pd.merge(fill_null(cleaned_data, numeric_fields),
                              fill_null(pg_data, numeric_fields),
                              on=duplicate_columns,
                              how='left',
                              indicator='exists')
            is_pg_duplicate = (
                merged.drop_duplicates('id_').exists == 'both'
            ).values  # pd.merge creates a new index so just get array of bool values
            cleaned_data['found_in_db'] = is_pg_duplicate
            pg_duplicates = cleaned_data.loc[cleaned_data.found_in_db]
        else:
            # Still need an empty dataframe here so the pd.concat() below doesn't fail
            pg_duplicates = pd.DataFrame()

        duplicates = pd.concat([sl_duplicates, pg_duplicates], sort=False)

        # In case any records were duplicated in the app and the DB, reduce the df by the index. max() will return
        #   True/False if one of the repeated indices is NaN but another is True/False. All other columns should
        #   be identical since a duplicated index represents the same record from the sqlite DB
        duplicates = duplicates.groupby(duplicates.index)\
            .max()\
            .fillna(False)

        df['duplicated_in_app'] = False
        df['found_in_db'] = False
        if len(sl_duplicates):
            df.loc[duplicates.index,
                   'duplicated_in_app'] = duplicates.duplicated_in_app
        if len(pg_duplicates):
            df.loc[duplicates.index, 'found_in_db'] = duplicates.found_in_db

        # If this table contains any lookup values, check to see if all data values exist in the corresponding
        #  lookup table
        if 'destination' in df.columns:
            destination_lookup_params = pd.Series({
                'data_table': table_name,
                'lookup_table': 'destination_codes',
                'lookup_index': 'code',
                'lookup_value': 'name'
            })
            missing_info = get_missing_lookup(df, table_name, 'destination',
                                              postgres_engine,
                                              destination_lookup_params)
            if len(missing_info) > 0:
                missing_lookup_dfs.append(missing_info)

        if table_name in LOOKUP_FIELDS.index:
            for data_field, lookup_params in LOOKUP_FIELDS.loc[
                    table_name].iterrows():
                missing_info = get_missing_lookup(df, table_name, data_field,
                                                  postgres_engine,
                                                  lookup_params)
                if len(missing_info) > 0:
                    missing_lookup_dfs.append(missing_info)

        # Access expects datetimes in the format mm/dd/yyyy hh:mm:ss so reformat it
        df.datetime = df.datetime.dt.strftime('%m/%d/%Y %H:%M:%S')

        # This is possibly one of the dumbest things I've ever had to do in code, but Access doesn't handle columns
        #  with mixed data types well -- it will sometimes read a column containing both integers and text as
        #  integer, meaning the text rows will fail to import. To force Access to read all of it as text, make the
        #  first 50 rows all nonsense text. These rows will then be deleted as soon as they're imported.
        df = pd.concat([
            pd.DataFrame(np.full((50, len(df.columns)), 'aaaa'),
                         columns=df.columns), df
        ])

        df.to_csv(flagged_path, index=False, encoding='utf-8')

    # If there were any missing lookup values, save the CSV
    if len(missing_lookup_dfs) > 0:
        missing_lookup = pd.concat(missing_lookup_dfs)
        missing_lookup.to_csv(os.path.join(
            output_dir, 'missing_lookup_values_flagged.csv'),
                              index=False)
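
# Standalone sketch of the duplicate check above (toy dataframes, no database): merging with
# indicator='exists' flags which app records already exist in the master table, and
# drop_duplicates('id_') keeps one row per original record before reading the flag.
import pandas as pd

app = pd.DataFrame({'datetime': ['07/01/2019 09:00:00', '07/01/2019 09:05:00'],
                    'destination': ['EVC', 'PRM']})
master = pd.DataFrame({'datetime': ['07/01/2019 09:00:00'],
                       'destination': ['EVC']})
app['id_'] = app.index
merged = pd.merge(app, master, on=['datetime', 'destination'], how='left', indicator='exists')
app['found_in_db'] = (merged.drop_duplicates('id_').exists == 'both').values
print(app)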
Example #6
def main(connection_txt, out_dir, json_path=None):

    sys.stdout.write("Log file for %s\n%s\n\n" %
                     (__file__, datetime.now().strftime('%H:%M:%S %m/%d/%Y')))
    sys.stdout.write('Command: python %s\n\n' %
                     subprocess.list2cmdline(sys.argv))
    sys.stdout.flush()

    if json_path:
        json_dropdown_options, json_field_properties, json_data = parse_json_data(
            json_path)

    # Get lookup values from the DB
    dropdown_options = []
    postgres_engine = connect_db(connection_txt)
    for _, table_info in FIELD_PROPERTIES.iterrows():
        if table_info.validation_table:
            sql = "SELECT DISTINCT {validation_field} FROM {validation_table};".format(
                **table_info)
            if table_info.config_column == 'Bus type':
                sql = sql.replace(';', " WHERE NOT is_lodge_bus;")
            elif table_info.config_column == 'Lodge':
                sql = sql.replace(';', " WHERE is_lodge_bus;")
            #elif table_info.config_column == 'Inholder':
            #max_year = pd.read_sql("SELECT replace(column_name, '_', '') FROM information_schema.columns WHERE table_name = 'inholder_allotments' AND left(column_name, 1) = '_';", conn).squeeze().max()
            #sql = sql.replace(';', " WHERE _{year} <> 0 AND _{year} IS NOT NULL;".format(year=max_year))
            elif table_info.config_column == 'Destination':
                sql = "SELECT name FROM (SELECT * FROM destination_codes ORDER BY mile) AS foo;"
            with postgres_engine.connect() as conn, conn.begin():
                db_values = pd.read_sql(sql, conn).squeeze()
            db_values = db_values[db_values != 'Null']

            # if a JSON config file was given, append new values from the DB to the existing values
            if json_path:
                json_values = json_dropdown_options[
                    table_info.config_column].dropna()
                missing = json_values.loc[~json_values.isin(db_values)]
                db_values = missing.append(db_values)
                if json_field_properties.loc['sorted',
                                             table_info.config_column]:
                    db_values = db_values.sort_values()

            dropdown_options.append(
                pd.DataFrame({table_info.config_column: db_values.tolist()}))

        else:
            values = []
            if json_path:
                values += json_dropdown_options[
                    table_info.config_column].tolist()
            dropdown_options.append(
                pd.DataFrame({table_info.config_column: values}))

    # Concatenate all of the options into a single dataframe
    field_options = pd.concat(dropdown_options, axis=1,
                              sort=False).reindex(columns=COLUMN_ORDER)

    # If json_path was given, use the field_properties from the JSON config file
    if json_path:
        field_properties = json_field_properties.copy()
    # Otherwise, reformat the FIELD_PROPERTIES df
    else:
        field_properties = FIELD_PROPERTIES.set_index(
            'config_column').T.reindex(columns=COLUMN_ORDER)
        field_properties.loc['sorted'] = False
        field_properties.index.name = 'attribute'

    # Create the missing values CSV because the VBA code will expect it even though there won't be any missing values
    missing_values = pd.DataFrame(columns=[
        'data_value', 'data_table', 'data_field', 'lookup_table',
        'lookup_field'
    ])

    try:
        if os.path.isdir(out_dir):
            out_dir = os.path.join(out_dir, '_temp')
            if not os.path.isdir(out_dir):
                os.mkdir(out_dir)
        else:
            out_dir = os.path.join(out_dir, '_temp')
            os.makedirs(out_dir)
    except Exception as e:
        raise IOError('Could not create output directory at %s because %s' %
                      (out_dir, e.message))
    subprocess.call(["attrib", "+H", out_dir])  # Make sure it's hidden

    field_options.to_csv(os.path.join(out_dir,
                                      'json_config_dropdown_options.csv'),
                         index=False)
    field_properties.to_csv(
        os.path.join(out_dir, 'json_config_field_properties.csv'))
    missing_values.to_csv(os.path.join(out_dir,
                                       'json_config_missing_values.csv'),
                          index=False)
    if json_path:
        global_properties = {
            k: v
            for k, v in json_data.iteritems()
            if type(v) == str or type(v) == unicode or type(v) == bool
        }
        if len(global_properties):
            pd.DataFrame(global_properties, index=[0])\
                .to_csv(os.path.join(out_dir, 'json_config_global_properties.csv'), index=False)

    print 'Parsed data written to %s' % out_dir
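
# Hypothetical invocation sketch (paths are placeholders, not from the original source): writes
# the dropdown options, field properties, and (empty) missing-values CSVs that the Access/VBA
# front end reads, optionally merging values from an existing JSON config.
if __name__ == '__main__':
    main('connection_info.txt', r'C:\savage\config',
         json_path=r'C:\savage\config\savage_config.json')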