# read in cell-level data
# table contains site_id, grid_id, i, j, central_area, net_pv_capacity.
with open('cell_central_pv_capacity_original.csv') as csvfile:
    data = list(csv.DictReader(csvfile))
x, y, area = np.array(
    list((r["i"], r["j"], r["central_area"]) for r in data), dtype=float
).T
# data = csv_to_dict('cell_central_pv_capacity_original.csv')
# i = np.array(data["i"], dtype=float)
# j = np.array(data["j"], dtype=float)
# area = np.array(data["central_area"], dtype=float)

# cluster the cells into 150 projects (somewhat arbitrarily) instead of ~750,
# and use the cluster numbers as new site_id's.
km = KMeans(150, np.c_[x, y], size=0.0001*area)
km.init_centers()
km.find_centers()
# km.plot()
for i in range(len(x)):
    # km.cluster_id is an array of cluster id's, same length as x and y
    data[i]["cluster_id"] = km.cluster_id[i]

# insert the modified data into the database
# note: it is reportedly faster to construct a single insert query with all the
# values using python's string construction operators, since executemany runs
# numerous separate inserts. However, it's considered more secure to use the
# database library's template substitution, so we do that.
# each %(name)s placeholder below is filled from the matching key in the row dicts
executemany("""
    INSERT INTO cell_central_pv_capacity
        (cluster_id, site_id, grid_id, i, j, central_area, net_pv_capacity)
    VALUES (
        %(cluster_id)s, %(site_id)s, %(grid_id)s, %(i)s, %(j)s,
        %(central_area)s, %(net_pv_capacity)s
    );
""", data)
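
# Illustrative only: the executemany() helper used above is defined elsewhere in
# this codebase; it is assumed to be a thin wrapper around a DB-API
# cursor.executemany() call, roughly like the hypothetical sketch below
# (the real helper's name, connection handling and commit behavior may differ).
def _executemany_sketch(query, rows, connection):
    # `rows` is a sequence of dicts whose keys match the %(name)s placeholders
    # in `query`; the driver quotes each value, so no manual string building.
    cur = connection.cursor()
    cur.executemany(query, rows)
    connection.commit()
    cur.close()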
def distributed_pv():
    # TODO: break up the major sub-sections of the main loop into separate functions
    # TODO: merge the code that gets capacity factors for each configuration here
    # with the code that gets capacity factors for each cell for utility-scale PV.
    # TODO: write a general function that adds a block of projects and capacity
    # factors to the postgresql database (including reading back the project IDs),
    # and use that for distributed PV, utility-scale PV and wind projects

    # read roof areas from load_zone_nsrdb_cell.csv
    # treat any NaNs (missing data) as 0.0 coverage
    all_cells = pd.DataFrame.from_csv(
        db_path('GIS/General/load_zone_nsrdb_cell.csv'),
        index_col=('load_zone', 'nsrdb_id')).fillna(0.0)

    # make sure tables exist, and clear out existing DistPV data;
    # the loops below will add records back to these tables, one load zone at a time
    shared_tables.create_table("project")
    shared_tables.create_table("cap_factor")
    # drop and recreate indexes is faster than incremental sorting
    shared_tables.drop_indexes("cap_factor")
    execute("""
        DELETE FROM cap_factor c
            USING project p
            WHERE c.project_id=p.project_id AND p.technology='DistPV';
        DELETE FROM project WHERE technology='DistPV';
        DROP TABLE IF EXISTS dist_pv_details;
    """)

    # calculate hourly capacity factor for all dist pv configurations
    # for each cell in each load zone
    for load_zone in load_zones:
        lz_cells = all_cells.loc[load_zone, :]
        lz_cells = lz_cells[lz_cells.roof_area > 0.0]
        # create arrays to hold capacities and hourly capacity factors for all
        # cells in this load zone; cap_factors will end up with one row per cell
        # and configuration and one column per hour (after the reshape below)
        cap_factors = None
        for cell_n, cell in enumerate(lz_cells.itertuples()):
            cell_capacities, cell_cap_factors = get_dist_pv_cap_factors(
                cell.nsrdb_lat, cell.nsrdb_lon, cell.roof_area)
            # note: this is the first time when we know how many configurations
            # and timesteps there are, so this is when we create the cap_factors array
            if cap_factors is None:
                capacities = np.empty((len(lz_cells),) + cell_capacities.shape)
                cap_factors = np.empty((len(lz_cells),) + cell_cap_factors.shape)
                # fill them with nans, so we'll see if any aren't filled later
                capacities.fill(np.nan)
                cap_factors.fill(np.nan)
            capacities[cell_n, :] = cell_capacities
            cap_factors[cell_n, :, :] = cell_cap_factors

        # reshape into a long list of resources instead of a cell x config matrix
        capacities = capacities.reshape((-1,))
        cap_factors = cap_factors.reshape((-1, cap_factors.shape[2]))

        # cluster available resources into 20 tranches with similar timing and quality
        # (we assume the better-suited ones will be developed before the worse ones)
        # (This could be sped up by using a subsample of the timesteps if needed, but then
        # the cluster means would have to be calculated afterwards.)
        # an alternative approach would be to cluster resources based on annual average
        # capacity factor, but that neglects differences in timing between different
        # orientations.
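        # note: the KMeans class used here (and for the utility-scale cells above)
        # is assumed to be a custom, size-weighted k-means rather than sklearn's:
        # each row of X is one resource, `size` weights that row when centers are
        # updated, and after find_centers() km.mu holds the cluster-mean rows and
        # km.cluster_id the cluster assigned to each row.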
        km = KMeans(20, X=cap_factors, size=capacities)
        import time
        start = time.time()
        km.init_centers()   # 3 mins
        print("init_centers(): {} s".format(time.time() - start))
        start = time.time()
        km.find_centers()   # 9 mins
        print("find_centers(): {} s".format(time.time() - start))
        # now km.mu is a matrix of capacity factors, with one row per cluster
        # and one column per timestep,
        # and km.cluster_id shows which cluster each resource belongs to
        cluster_capacities = np.bincount(km.cluster_id, weights=capacities)
        cluster_cap_factors = km.mu.T

        # PROJECT TABLE
        # store project definitions
        project_df = pd.DataFrame.from_items([
            ('load_zone', load_zone),
            ('technology', 'DistPV'),
            ('site', [load_zone + '_DistPV_' + str(i)
                      for i in range(len(cluster_capacities))]),
            ('orientation', 'na'),
            ('max_capacity', cluster_capacities),
            ('connect_cost_per_mw', 0.0)
        ]).set_index(['load_zone', 'technology', 'site', 'orientation'])
        project_df.to_sql('project', db_engine, if_exists='append')

        # CAP_FACTOR TABLE
        # get timesteps for each year (based on lat and lon of last cell in the list)
        timesteps = [
            get_timesteps(nsrdb_file_dict[(cell.nsrdb_lat, cell.nsrdb_lon, year)])
            for year in years
        ]
        # make an index of all timesteps
        timestep_index = pd.concat(
            (pd.DataFrame(index=x) for x in timesteps)).index.sort_values()
        # make an index of all site_ids (must match the names used in project_df above)
        # TODO: change this code and the project_df code to zero-fill site numbers up to
        # 2 digits (enough to cover the number of tranches in each zone)
        site_ids = [
            load_zone + '_DistPV_' + str(i)
            for i in range(cluster_cap_factors.shape[1])
        ]
        # multiindex of load_zone, technology, site, orientation
        proj_index = pd.MultiIndex.from_product(
            [[load_zone], ['DistPV'], site_ids, ['na']])
        # make a single dataframe to hold all the data
        cap_factor_df = pd.DataFrame(
            cluster_cap_factors, index=timestep_index, columns=proj_index)
        cap_factor_df.columns.names = ['load_zone', 'technology', 'site', 'orientation']
        cap_factor_df.index.names = ['date_time']
        # convert to database orientation, with natural order for indexes,
        # but also keep as a DataFrame
        cap_factor_df = pd.DataFrame(
            {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)})
        # sort table, then switch to using z, t, s, o as index (to match with project table)
        # (takes a few minutes)
        cap_factor_df = cap_factor_df.reorder_levels(
            ['load_zone', 'technology', 'site', 'orientation', 'date_time']
        ).sort_index().reset_index('date_time')
        # retrieve the project IDs (created automatically in the database earlier)
        # note: this read-back could potentially be done earlier, and then the
        # project ids could be included in cap_factor_df when it is first created,
        # but this provides cross-referencing by z, t, s, o automatically, which is helpful.
        project_ids = pd.read_sql(
            "SELECT project_id, load_zone, technology, site, orientation "
            + "FROM project WHERE technology = 'DistPV';",
            db_engine,
            index_col=['load_zone', 'technology', 'site', 'orientation'])
        cap_factor_df['project_id'] = project_ids['project_id']
        # convert date_time values into strings for insertion into postgresql.
        # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
        # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
        # note: the string conversion is pretty slow
        cap_factor_df['date_time'] = pd.DatetimeIndex(
            cap_factor_df['date_time']).strftime("%Y-%m-%d %H:%M:%S%z")
        cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)
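        # at this point cap_factor_df should have a (project_id, date_time) index
        # and a single 'cap_factor' column, i.e. one row per tranche per hour,
        # which is the layout inserted into the cap_factor table below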
        # Do we need error checking here?
        # If any projects aren't in cap_factor_df, they'll create single rows with
        # NaNs (and any prior existing cap_factors for them will get dropped below).
        # If any rows in cap_factor_df aren't matched to a project, they'll go in
        # with a null project_id.
        # The next line is very slow. But it only seems possible to speed it up by
        # copying the data to a csv and doing a bulk insert (see the sketch after
        # this function), which is more cumbersome.
        # progress can be monitored via this command in psql:
        # select query from pg_stat_activity where query like 'INSERT%';
        cap_factor_df.to_sql('cap_factor', db_engine, if_exists='append', chunksize=10000)

        # DIST_PV_DETAILS TABLE
        # store cluster details for later reference
        # would be interesting to see mean and stdev of lat, lon,
        # cap factor, azimuth, tilt for each cluster, so we can describe them.
        dist_pv_details = pd.Panel(
            {
                'capacity_mw': capacities.reshape((len(lz_cells), -1)),
                'site': (
                    load_zone + '_DistPV_'
                    + km.cluster_id.astype(str).astype(np.object)
                ).reshape((len(lz_cells), -1))
            },
            major_axis=[lz_cells[col] for col in ['nsrdb_lat', 'nsrdb_lon']],
            minor_axis=[dist_pv_configs[col] for col in dist_pv_configs.columns]
        ).to_frame().reset_index()
        dist_pv_details.insert(0, 'load_zone', load_zone)
        # store in postgresql database
        dist_pv_details.to_sql('dist_pv_details', db_engine, if_exists='append')

    # restore indexes, final cleanup
    shared_tables.create_indexes("cap_factor")
    execute("ALTER TABLE dist_pv_details OWNER TO admin;")
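
# Hypothetical sketch of the faster bulk-load alternative mentioned in
# distributed_pv() above: dump the frame to in-memory CSV text and load it with
# PostgreSQL's COPY. Not used here because to_sql() is simpler; the helper name,
# the psycopg2 connection argument and the column handling are all assumptions
# (assumes Python 3's io.StringIO and that `columns` lists the table columns in
# the same order as the frame's index + data columns).
def _copy_from_csv_sketch(df, table, columns, connection):
    import io
    buf = io.StringIO()
    # write index + data columns as CSV text, no header row
    df.to_csv(buf, header=False)
    buf.seek(0)
    cur = connection.cursor()
    cur.copy_expert(
        "COPY {} ({}) FROM STDIN WITH (FORMAT csv)".format(
            table, ", ".join(columns)),
        buf)
    connection.commit()
    cur.close()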