def distributed_pv():
    # for now, just reuse old data

    # store data in postgresql tables
    shared_tables.create_table("project")
    shared_tables.create_table("cap_factor")

    # remove old records (do this before dropping indexes, so the DELETEs can still use them)
    execute("""
        DELETE FROM cap_factor WHERE project_id IN (SELECT project_id FROM project WHERE technology = 'DistPV');
    """)
    execute("""
        DELETE FROM project WHERE technology = 'DistPV';
    """)

    # remove indexes
    shared_tables.drop_indexes("cap_factor")    # drop and recreate is faster than incremental sorting

    execute("""
        INSERT INTO project (load_zone, technology, site, orientation, max_capacity)
        SELECT load_zone, technology, 'DistPV' AS site, orientation, max_capacity
            FROM max_capacity_pre_2016_06_21
            WHERE technology = 'DistPV';
    """)
    execute("""
        INSERT INTO cap_factor (project_id, date_time, cap_factor)
            SELECT project_id, date_time, cap_factor
                FROM cap_factor_pre_2016_06_21 cf JOIN project USING (load_zone, technology, orientation)
                WHERE cf.technology = 'DistPV';
    """)

    # restore indexes
    shared_tables.create_indexes("cap_factor")
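
# The execute() helper called throughout these examples is not shown in the
# snippets. Below is a minimal sketch of what such a helper might look like,
# assuming psycopg2; the connection settings and the function body are
# illustrative assumptions, not the project's actual implementation.
import psycopg2

def execute(sql, params=None, dbname='switch', user='admin'):
    """Run one (possibly multi-statement) SQL string and commit it."""
    con = psycopg2.connect(dbname=dbname, user=user)  # hypothetical settings
    try:
        with con.cursor() as cur:
            cur.execute(sql, params)
        con.commit()
    finally:
        con.close()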
# Example 2
def generator_info():
    # note: for now, these must always be run in this sequence, because
    # new_generator_info() creates the generator_info and part_load_fuel_consumption
    # tables, and then existing_generator_info() appends to them (without clearing
    # out any prior records)
    shared_tables.create_table('project')
    shared_tables.create_table('generator_info')
    new_generator_info()
    existing_generator_info()
def tracking_pv():
    # make a list of all available NSRDB data files
    nsrdb_file_dict, years = get_nsrdb_file_dict()
    cluster_cell = pd.read_csv(
        db_path('GIS/Utility-Scale Solar Sites/solar_cluster_nsrdb_grid_renamed.csv'),
        index_col='gridclstid'
    )
    cluster_cell = cluster_cell[cluster_cell['solar_covg']>0]
    cell = cluster_cell.groupby('nsrdb_id')
    cluster = cluster_cell.groupby('cluster_id')
    cluster_total_solar_area = cluster['solar_area'].sum()

    cluster_ids = cluster.groups.keys()             # list of all cluster ids, for convenience
    cluster_id_digits = len(str(max(cluster_ids)))  # max number of digits for a cluster id
    # site_ids for each cluster_id (these distinguish PV sites from wind sites that may have the same number)
    site_ids = ['PV_' + str(cluster_id).zfill(cluster_id_digits) for cluster_id in cluster_ids]
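    # e.g., if the largest cluster_id is 120, ids are zero-filled to 3 digits: 'PV_001', ..., 'PV_120'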

    # calculate weighted average lat and lon for each cluster
    # (note: the axis=0 and axis=1 keep pandas from generating lots of nans due to
    # trying to match column name in addition to row index)
    cluster_coords = pd.concat([
        cluster_cell['cluster_id'],
        cluster_cell[['solar_lat', 'solar_lon']].multiply(cluster_cell['solar_area'], axis=0)
    ], axis=1).groupby('cluster_id').sum().div(cluster_total_solar_area, axis=0)
    cluster_coords.columns=['latitude', 'longitude']
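    # i.e., for each cluster c: lat_c = sum_i(area_i * lat_i) / sum_i(area_i), and likewise for lon;
    # e.g., two cells with solar_area 1 and 3 at latitudes 21.0 and 21.4 give
    # (1*21.0 + 3*21.4) / 4 = 21.3 as the cluster latitude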

    # get list of technologies to be defined
    technologies = central_solar_techs.index.values

    # calculate capacity factors for all projects

    # This dict will hold vectors of capacity factors for each cluster for each year and technology.
    # This arrangement is simpler than using a DataFrame because we don't yet know the
    # indexes (timesteps) of the data for each year.
    cluster_cap_factors = dict()
    for tech in technologies:
        # go through all the needed nsrdb cells and add them to the capacity factor for the
        # relevant cluster and year
        for cell_id, grp in cell:
            # grp has one row for each cluster that uses data from this cell
            lat = round_coord(grp['nsrdb_lat'].iloc[0])
            lon = round_coord(grp['nsrdb_lon'].iloc[0])
            for year in years:
                cap_factors = get_cap_factors(nsrdb_file_dict[lat, lon, year], central_solar_techs.loc[tech, 'array_type'])
                # note: iterrows() would upcast each row to a single (float) Series,
                # but itertuples() preserves the column dtypes
                for clst in grp.itertuples():
                    contrib = cap_factors * clst.solar_area / cluster_total_solar_area[clst.cluster_id]
                    key = (tech, clst.cluster_id, year)
                    if key in cluster_cap_factors:
                        cluster_cap_factors[key] += contrib
                    else:
                        cluster_cap_factors[key] = contrib
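
    # i.e., each cluster's hourly capacity factor ends up as the area-weighted average
    # of its cells' capacity factors:
    #   cf[tech, cluster, t] = sum_cells(cf_cell[t] * solar_area_in_cell) / cluster_total_solar_area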

    # get timesteps for each year (based on lat and lon of first cell in the list)
    timesteps = dict()
    lat = round_coord(cluster_cell['nsrdb_lat'].iloc[0])
    lon = round_coord(cluster_cell['nsrdb_lon'].iloc[0])
    for year in years:
        timesteps[year] = get_timesteps(nsrdb_file_dict[(lat, lon, year)])

    # make an index of all timesteps
    timestep_index = pd.concat([pd.DataFrame(index=x) for x in timesteps.values()]).index.sort_values()
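    # (this builds one sorted DatetimeIndex covering all years; duplicates would only
    # appear if two years shared timesteps, which they should not)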

    # make a single dataframe to hold all the data
    cap_factor_df = pd.DataFrame(
        index=timestep_index,
        columns=pd.MultiIndex.from_product([technologies, site_ids]),
        dtype=float
    )
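    # cap_factor_df starts out "wide": one row per timestep, one column per
    # (technology, site) pair, all NaN until filled from cluster_cap_factors below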

    # assign values to the dataframe
    for ((tech, cluster_id, year), cap_factors) in cluster_cap_factors.items():
        cap_factor_df.update(pd.DataFrame(
                cap_factors,
                index=timesteps[year],
                columns=[(tech, 'PV_' + str(cluster_id).zfill(cluster_id_digits))]
        ))
    cap_factor_df.columns.names = ['technology', 'site']
    cap_factor_df.index.names=['date_time']

    # add load_zone and orientation to the index
    cap_factor_df['load_zone'] = load_zone
    cap_factor_df['orientation'] = 'na'
    cap_factor_df.set_index(['load_zone', 'orientation'], append=True, inplace=True)
    # convert to database orientation, with natural order for indexes,
    # but also keep as a DataFrame
    cap_factor_df = pd.DataFrame(
        {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)}
    )
    # sort table, then switch to using z, t, s, o as index (to match with project table)
    cap_factor_df = cap_factor_df.reorder_levels(
        ['load_zone', 'technology', 'site', 'orientation', 'date_time']
    ).sort_index().reset_index('date_time')
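    # after the stack/reorder above the frame is "long": indexed by
    # (load_zone, technology, site, orientation), with date_time and cap_factor as
    # columns, matching the layout of the cap_factor table (project_id is added below)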

    # make a dataframe showing potential projects (same structure as "project" table)
    # note: for now we don't really handle multiple load zones and we don't worry about orientation
    # (may eventually have projects available with different azimuth and slope)
    # This concatenates a list of DataFrames, one for each technology
    project_df = pd.concat([
        pd.DataFrame(dict(
            load_zone=load_zone,
            technology=tech,
            site=site_ids,
            orientation='na',
            max_capacity=cluster_total_solar_area*central_solar_techs.loc[tech, 'mw_per_m2'],
            latitude=cluster_coords['latitude'],
            longitude=cluster_coords['longitude'],
        ))
        for tech in technologies
    ], axis=0).set_index(['load_zone', 'technology', 'site', 'orientation'])
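    # project_df has one row per (technology, site) pair; max_capacity converts each
    # cluster's usable solar area (m2) to MW via the technology's mw_per_m2 packing density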


    # store data in postgresql tables
    shared_tables.create_table("project")
    execute("DELETE FROM project WHERE technology IN %s;", [tuple(technologies)])
    project_df.to_sql('project', db_engine, if_exists='append')

    # retrieve the project IDs (created automatically in the database)
    project_ids = pd.read_sql(
        "SELECT project_id, load_zone, technology, site, orientation "
        + "FROM project WHERE technology IN %(techs)s;",
        db_engine, index_col=['load_zone', 'technology', 'site', 'orientation'],
        params={'techs': tuple(technologies)}
    )
    cap_factor_df['project_id'] = project_ids['project_id']
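    # (this assignment aligns on the shared (load_zone, technology, site, orientation)
    # index, so each cap_factor row picks up the matching project_id; unmatched rows get NaN)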

    # convert date_time values into strings for insertion into postgresql.
    # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
    # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
    # note: the string conversion is pretty slow
    cap_factor_df['date_time'] = pd.DatetimeIndex(cap_factor_df['date_time']).strftime("%Y-%m-%d %H:%M:%S%z")

    cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)
    # Do we need error checking here? If any projects aren't in cap_factor_df, they'll
    # create single rows with NaNs (and any prior existing cap_factors for them will
    # get dropped below).
    # If any rows in cap_factor_df aren't matched to a project, they'll go in with
    # a null project_id.
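    # A possible sanity check for the concerns above (a sketch only, not part of the
    # original script):
    #   pid = cap_factor_df.index.get_level_values('project_id')
    #   assert not pid.isnull().any(), 'some cap_factor rows have no matching project'
    #   assert set(project_ids['project_id']) <= set(pid), 'some projects have no cap_factors'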

    shared_tables.create_table("cap_factor")    # only created if it doesn't exist
    shared_tables.drop_indexes("cap_factor")    # drop and recreate is faster than incremental sorting
    execute("DELETE FROM cap_factor WHERE project_id IN %s;", [tuple(project_ids['project_id'])])
    cap_factor_df.to_sql('cap_factor', db_engine, if_exists='append', chunksize=10000)
    shared_tables.create_indexes("cap_factor")
# Example 4
def distributed_pv():
    # TODO: break up the major sub-sections of the main loop into separate functions
    # TODO: merge the code that gets capacity factors for each configuration here
    # with the code that gets capacity factors for each cell for utility-scale PV.
    # TODO: write a general function that adds a block of projects and capacity
    # factors to the postgresql database (including reading back the project IDs),
    # and use that for distributed PV, utility-scale PV and wind projects

    # read roof areas from load_zone_nsrdb_cell.csv
    # treat any NaNs (missing data) as 0.0 coverage
    all_cells = pd.read_csv(
        db_path('GIS/General/load_zone_nsrdb_cell.csv'),
        index_col=['load_zone', 'nsrdb_id']).fillna(0.0)

    # make sure tables exist, and clear out existing DistPV data;
    # the loops below will add records back to this table, one load zone at a time
    shared_tables.create_table("project")
    shared_tables.create_table("cap_factor")
    shared_tables.drop_indexes("cap_factor")    # drop and recreate is faster than incremental sorting
    execute("""
        DELETE FROM cap_factor c 
            USING project p 
            WHERE c.project_id=p.project_id AND p.technology='DistPV';
        DELETE FROM project
            WHERE technology='DistPV';
        DROP TABLE IF EXISTS dist_pv_details;")
    """)

    # calculate hourly capacity factor for all dist pv configurations
    # for each cell in each load zone
    for load_zone in load_zones:    # the body below tags projects and details with load_zone
        lz_cells = all_cells.loc[load_zone, :]
        lz_cells = lz_cells[lz_cells.roof_area > 0.0]
        # create an array to hold hourly capacity factors for all cells in this load zone
        # it will end up with one row for each cell and one column for each hour
        cap_factors = None
        for cell_n, cell in enumerate(lz_cells.itertuples()):
            cell_capacities, cell_cap_factors = get_dist_pv_cap_factors(
                cell.nsrdb_lat, cell.nsrdb_lon, cell.roof_area)
            # note: this is the first time when we know how many configurations
            # and timesteps there are, so this is when we create the cap_factors array
            if cap_factors is None:
                capacities = np.empty((len(lz_cells), ) +
                                      cell_capacities.shape)
                cap_factors = np.empty((len(lz_cells), ) +
                                       cell_cap_factors.shape)
                # fill them with nans, so we'll see if any aren't filled later
                capacities.fill(np.nan)
                cap_factors.fill(np.nan)
            capacities[cell_n, :] = cell_capacities
            cap_factors[cell_n, :, :] = cell_cap_factors

        # reshape into a long list of resources instead of a cell x config matrix
        capacities = capacities.reshape((-1, ))
        cap_factors = cap_factors.reshape((-1, cap_factors.shape[2]))
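        # e.g., 500 cells x 8 configurations become 4000 candidate resources:
        # capacities goes from shape (500, 8) to (4000,) and cap_factors from
        # (500, 8, T) to (4000, T), where T is the number of timesteps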

        # cluster available resources into 20 tranches with similar timing and quality
        # (we assume the better-suited ones will be developed before the worse ones)
        # (This could be sped up by using a subsample of the timesteps if needed, but then
        # the cluster means would have to be calculated afterwards.)
        # an alternative approach would be to cluster resources based on annual average
        # capacity factor, but that neglects differences in timing between different
        # orientations.
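        # For reference, a roughly analogous weighted clustering could be sketched with
        # scikit-learn (the KMeans class used below is a project-specific implementation
        # with a different interface; this is only an analogy, not the code used here):
        #   from sklearn.cluster import KMeans as SKKMeans
        #   sk = SKKMeans(n_clusters=20).fit(cap_factors, sample_weight=capacities)
        #   cluster_id, mu = sk.labels_, sk.cluster_centers_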
        km = KMeans(20, X=cap_factors, size=capacities)
        import time
        start = time.time()
        km.init_centers()  # 3 mins
        print("init_centers(): {} s".format(time.time() - start))
        start = time.time()
        km.find_centers()  # 9 mins
        print("find_centers(): {} s".format(time.time() - start))
        # now km.mu is a matrix of capacity factors, with one row per cluster
        # and one column per timestep
        # and km.cluster_id shows which cluster each resource belongs to

        cluster_capacities = np.bincount(km.cluster_id, weights=capacities)
        cluster_cap_factors = km.mu.T
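        # np.bincount with weights sums the capacity of the resources assigned to each
        # cluster, e.g. cluster_id=[0, 1, 0] with capacities=[2.0, 5.0, 1.0] gives
        # cluster_capacities=[3.0, 5.0]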

        # PROJECT TABLE

        # store project definitions and capacity factors
        # (site names use the load_zone prefix so they match the site_ids built below)
        project_df = pd.DataFrame({
            'load_zone': load_zone,
            'technology': 'DistPV',
            'site': [load_zone + '_DistPV_' + str(i)
                     for i in range(len(cluster_capacities))],
            'orientation': 'na',
            'max_capacity': cluster_capacities,
            'connect_cost_per_mw': 0.0,
        }).set_index(['load_zone', 'technology', 'site', 'orientation'])
        project_df.to_sql('project', db_engine, if_exists='append')

        # CAP_FACTOR TABLE

        # get timesteps for each year (based on lat and lon of last cell in the list)
        timesteps = [
            get_timesteps(nsrdb_file_dict[(cell.nsrdb_lat, cell.nsrdb_lon,
                                           year)]) for year in years
        ]
        # make an index of all timesteps
        timestep_index = pd.concat(
            (pd.DataFrame(index=x) for x in timesteps)).index.sort_values()

        # make an index of all site_ids
        # TODO: change this code and project_df code to zero-fill site numbers up to 2 digits
        # (enough to cover the number of tranches in each zone)
        site_ids = [
            load_zone + '_DistPV_' + str(i)
            for i in range(cluster_cap_factors.shape[1])
        ]

        # multiindex of load_zone, technology, site, orientation
        proj_index = pd.MultiIndex.from_product([[load_zone], ['DistPV'],
                                                 site_ids, ['na']])

        # make a single dataframe to hold all the data
        cap_factor_df = pd.DataFrame(
            cluster_cap_factors,
            index=timestep_index,
            columns=proj_index,
        )
        cap_factor_df.columns.names = [
            'load_zone', 'technology', 'site', 'orientation'
        ]
        cap_factor_df.index.names = ['date_time']

        # convert to database orientation, with natural order for indexes,
        # but also keep as a DataFrame
        cap_factor_df = pd.DataFrame(
            {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)})
        # sort table, then switch to using z, t, s, o as index (to match with project table)
        # (takes a few minutes)
        cap_factor_df = cap_factor_df.reorder_levels(
            ['load_zone', 'technology', 'site', 'orientation',
             'date_time']).sort_index().reset_index('date_time')

        # retrieve the project IDs (created automatically in the database earlier)
        # note: this read-back could potentially be done earlier, and then the
        # project ids could be included in cap_factor_df when it is first created.
        # but this provides cross-referencing by z, t, s, o automatically, which is helpful.
        project_ids = pd.read_sql(
            "SELECT project_id, load_zone, technology, site, orientation " +
            "FROM project WHERE technology = 'DistPV';",
            db_engine,
            index_col=['load_zone', 'technology', 'site', 'orientation'])
        cap_factor_df['project_id'] = project_ids['project_id']

        # convert date_time values into strings for insertion into postgresql.
        # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
        # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
        # note: the string conversion is pretty slow
        cap_factor_df['date_time'] = pd.DatetimeIndex(
            cap_factor_df['date_time']).strftime("%Y-%m-%d %H:%M:%S%z")

        cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)
        # Do we need error checking here? If any projects aren't in cap_factor_df, they'll
        # create single rows with NaNs (any prior existing cap_factors for them were
        # already deleted at the top of this function).
        # If any rows in cap_factor_df aren't matched to a project, they'll go in with
        # a null project_id.

        # The next line is very slow. But it only seems possible to speed it up by
        # copying the data to a csv and doing a bulk insert, which is more cumbersome.
        # progress can be monitored via this command in psql:
        # select query from pg_stat_activity where query like 'INSERT%';
        cap_factor_df.to_sql('cap_factor',
                             db_engine,
                             if_exists='append',
                             chunksize=10000)

        # DIST_PV_DETAILS TABLE

        # store cluster details for later reference
        # would be interesting to see mean and stdev of lat, lon,
        # cap factor, azimuth, tilt for each cluster, so we can describe them.
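        # note: pd.Panel (used below) was removed in pandas 1.0; a roughly equivalent
        # construction with plain DataFrames would stack each (cell x config) matrix into
        # a long Series and concatenate those as columns, e.g. (toy sketch only, with
        # cell_index/config_index standing in for the lat-lon and config indexes used here):
        #   cap = pd.DataFrame(capacity_2d, index=cell_index, columns=config_index)
        #   site = pd.DataFrame(site_2d, index=cell_index, columns=config_index)
        #   details = pd.concat({'capacity_mw': cap.stack(), 'site': site.stack()},
        #                       axis=1).reset_index()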
        dist_pv_details = pd.Panel(
            {
                'capacity_mw':
                capacities.reshape((len(lz_cells), -1)),
                'site': (load_zone + '_DistPV_' +
                         km.cluster_id.astype(str).astype(object)).reshape(
                             (len(lz_cells), -1))
            },
            major_axis=[lz_cells[col] for col in ['nsrdb_lat', 'nsrdb_lon']],
            minor_axis=[
                dist_pv_configs[col] for col in dist_pv_configs.columns
            ]).to_frame().reset_index()
        dist_pv_details.insert(0, 'load_zone', load_zone)

        # store in postgresql database
        dist_pv_details.to_sql('dist_pv_details',
                               db_engine,
                               if_exists='append')

    # restore indexes, final cleanup
    shared_tables.create_indexes("cap_factor")
    execute("ALTER TABLE dist_pv_details OWNER TO admin;")
# Example 5
def tracking_pv():
    # note: this uses global nsrdb_file_dict and years variables

    cluster_cell = pd.read_csv(
        db_path('GIS/Utility-Scale Solar Sites/solar_cluster_nsrdb_grid_final.csv'),
        index_col='gridclstid')
    cluster_cell = cluster_cell[cluster_cell['solar_covg'] > 0]
    cell = cluster_cell.groupby('nsrdb_id')
    cluster = cluster_cell.groupby('cluster_id')
    cluster_total_solar_area = cluster['solar_area'].sum()

    cluster_ids = cluster.groups.keys()             # list of all cluster ids, for convenience
    cluster_id_digits = len(str(max(cluster_ids)))  # max number of digits for a cluster id
    # site_ids for each cluster_id (these distinguish PV sites from wind sites that may have the same number)
    site_ids = [
        'PV_' + str(cluster_id).zfill(cluster_id_digits)
        for cluster_id in cluster_ids
    ]

    # calculate weighted average lat and lon for each cluster
    # (note: the axis=0 and axis=1 keep pandas from generating lots of nans due to
    # trying to match column name in addition to row index)
    cluster_coords = pd.concat(
        [
            cluster_cell['cluster_id'], cluster_cell[[
                'solar_lat', 'solar_lon'
            ]].multiply(cluster_cell['solar_area'], axis=0)
        ],
        axis=1).groupby('cluster_id').sum().div(cluster_total_solar_area,
                                                axis=0)
    cluster_coords.columns = ['latitude', 'longitude']

    # get list of technologies to be defined
    technologies = central_solar_techs.index.values

    # calculate capacity factors for all projects

    # This dict will hold vectors of capacity factors for each cluster for each year and technology.
    # This arrangement is simpler than using a DataFrame because we don't yet know the
    # indexes (timesteps) of the data for each year.
    cluster_cap_factors = dict()
    for tech in technologies:
        # go through all the needed nsrdb cells and add them to the capacity factor for the
        # relevant cluster and year
        for cell_id, grp in cell:
            # grp has one row for each cluster that uses data from this cell
            lat = round_coord(grp['nsrdb_lat'].iloc[0])
            lon = round_coord(grp['nsrdb_lon'].iloc[0])
            array_type = central_solar_techs.loc[tech, 'array_type']
            gcr = central_solar_techs.loc[tech, 'gcr']
            azimuth = 180  # south-facing
            if array_type == 0:  # fixed-slope
                tilt = lat
            else:
                tilt = 0  # could eventually be southern component of ground slope for trackers
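            # (tilt equal to latitude is the usual rule of thumb for fixed arrays;
            # tracking arrays are modeled here with a horizontal axis, i.e. tilt 0)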
            for year in years:
                cap_factors = get_cap_factors(nsrdb_file_dict[lat, lon, year],
                                              array_type,
                                              azimuth=azimuth,
                                              tilt=tilt,
                                              gcr=gcr)
                # note: iterrows() would upcast each row to a single (float) Series,
                # but itertuples() preserves the column dtypes
                for clst in grp.itertuples():
                    contrib = cap_factors * clst.solar_area / cluster_total_solar_area[
                        clst.cluster_id]
                    key = (tech, clst.cluster_id, year)
                    if key in cluster_cap_factors:
                        cluster_cap_factors[key] += contrib
                    else:
                        cluster_cap_factors[key] = contrib

    # get timesteps for each year (based on lat and lon of first cell in the list)
    timesteps = dict()
    lat = round_coord(cluster_cell['nsrdb_lat'].iloc[0])
    lon = round_coord(cluster_cell['nsrdb_lon'].iloc[0])
    for year in years:
        timesteps[year] = get_timesteps(nsrdb_file_dict[(lat, lon, year)])

    # make an index of all timesteps
    timestep_index = pd.concat([
        pd.DataFrame(index=x) for x in timesteps.values()
    ]).index.sort_values()

    # make a single dataframe to hold all the data
    cap_factor_df = pd.DataFrame(index=timestep_index,
                                 columns=pd.MultiIndex.from_product(
                                     [technologies, site_ids]),
                                 dtype=float)

    # assign values to the dataframe
    for ((tech, cluster_id, year),
         cap_factors) in cluster_cap_factors.items():
        cap_factor_df.update(
            pd.DataFrame(cap_factors,
                         index=timesteps[year],
                         columns=[
                             (tech,
                              'PV_' + str(cluster_id).zfill(cluster_id_digits))
                         ]))
    cap_factor_df.columns.names = ['technology', 'site']
    cap_factor_df.index.names = ['date_time']

    # add load_zone and orientation to the index
    cap_factor_df['load_zone'] = load_zone
    cap_factor_df['orientation'] = 'na'
    cap_factor_df.set_index(['load_zone', 'orientation'],
                            append=True,
                            inplace=True)
    # convert to database orientation, with natural order for indexes,
    # but also keep as a DataFrame
    cap_factor_df = pd.DataFrame(
        {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)})
    # sort table, then switch to using z, t, s, o as index (to match with project table)
    cap_factor_df = cap_factor_df.reorder_levels(
        ['load_zone', 'technology', 'site', 'orientation',
         'date_time']).sort_index().reset_index('date_time')

    # make a dataframe showing potential projects (same structure as "project" table)
    # note: for now we don't really handle multiple load zones and we don't worry about orientation
    # (may eventually have projects available with different azimuth and slope)
    # This concatenates a list of DataFrames, one for each technology
    project_df = pd.concat(
        [
            pd.DataFrame(
                dict(
                    load_zone=load_zone,
                    technology=tech,
                    site=site_ids,
                    orientation='na',
                    max_capacity=cluster_total_solar_area *
                    central_solar_techs.loc[tech, 'mw_per_m2'],
                    latitude=cluster_coords['latitude'],
                    longitude=cluster_coords['longitude'],
                )) for tech in technologies
        ],
        axis=0).set_index(['load_zone', 'technology', 'site', 'orientation'])

    # store data in postgresql tables
    shared_tables.create_table("project")
    execute("DELETE FROM project WHERE technology IN %s;",
            [tuple(technologies)])
    project_df.to_sql('project', db_engine, if_exists='append')

    # retrieve the project IDs (created automatically in the database)
    project_ids = pd.read_sql(
        "SELECT project_id, load_zone, technology, site, orientation " +
        "FROM project WHERE technology IN %(techs)s;",
        db_engine,
        index_col=['load_zone', 'technology', 'site', 'orientation'],
        params={'techs': tuple(technologies)})
    cap_factor_df['project_id'] = project_ids['project_id']

    # convert date_time values into strings for insertion into postgresql.
    # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
    # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
    # note: the string conversion is pretty slow
    cap_factor_df['date_time'] = pd.DatetimeIndex(
        cap_factor_df['date_time']).strftime("%Y-%m-%d %H:%M:%S%z")

    cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)
    # Do we need error checking here? If any projects aren't in cap_factor_df, they'll
    # create single rows with NaNs (and any prior existing cap_factors for them will
    # get dropped below).
    # If any rows in cap_factor_df aren't matched to a project, they'll go in with
    # a null project_id.

    shared_tables.create_table("cap_factor")    # only created if it doesn't exist
    shared_tables.drop_indexes("cap_factor")    # drop and recreate is faster than incremental sorting
    execute("DELETE FROM cap_factor WHERE project_id IN %s;",
            [tuple(project_ids['project_id'])])
    cap_factor_df.to_sql('cap_factor',
                         db_engine,
                         if_exists='append',
                         chunksize=10000)
    shared_tables.create_indexes("cap_factor")