Code example #1
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "../Data_Clustered/"  # Base folder of clustered data
    filename = "MayDec2015_htv.csv"  # The file to load
    feature = SourceFeatures.SOURCEHTAQNV

    args = parse_args()
    source_stability = args["source_stability"]

    output_filename = filename + feature + str(source_stability) + ".csv"

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    selector_stability = df[
        ProcessingFeatures.SOURCE_STABILITY] == source_stability
    selector_running = df[ProcessingFeatures.SOURCE_RUNNING] == 1

    df_new = df.loc[selector_stability & selector_running, feature].copy()
    df_new.to_csv(output_filename, header=True)
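
Note: each snippet on this page loads its data through ld.read_data_from_csv(path, columns, rows). The load_data module itself is not reproduced here, so the following is only a minimal sketch of what such a loader could look like, assuming a timestamp-indexed CSV read with pandas; the project's real implementation may differ.

import pandas as pd


def read_data_from_csv(path, columns=None, nrows=None):
    # Hypothetical sketch of the loader used in the snippets (not the project's code):
    # columns=None loads every column, nrows=None loads every row.
    return pd.read_csv(
        path,
        usecols=columns,  # None -> all columns
        nrows=nrows,      # None -> all rows
        index_col=0,      # assumes the first column holds the timestamp
        parse_dates=True,
    )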
Code example #2
File: train_model.py  Project: sunyx1223/Wind-Solar
def train():
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()
        # Define the filename_queue here
        filenames = []
        filename_queue = tf.train.string_input_producer(filenames)

        # Get features and labels for training
        features, label = load_data.read_data_from_csv(filename_queue)

        input_batch, label_batch = load_data.make_batch(
            features, label, min_queue_examples=1000, batch_size=100)

        # Build a graph that computes predicted energy
        predicted_energy = main_functions.neural_net(input_batch)

        # Calculate loss
        loss = main_functions.loss(predicted_energy, label_batch)

        #Build a graph that trains the model with one batch of data
        # and updates the model parameters
        train_op = main_functions.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """This class logs loss and runtime"""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                # Asks for the loss value before each run
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # runs to fetch loss

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    examples_per_sec = FLAGS.batch_size / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(
                    log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Code example #3
File: ht_spark_test.py  Project: mexxexx/ionsrcopt
def main():
    input_file = "../Data_Raw/Nov2018.csv"
    columns = [
        SourceFeatures.TIMESTAMP,
        SourceFeatures.BCT05_CURRENT,
        SourceFeatures.SOURCEHTAQNV,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.SPARK_COUNTER,
    ]

    df = ld.read_data_from_csv(input_file, columns, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    source_running = calculate_source_running(df[SourceFeatures.BCT05_CURRENT])
    window_size = 20
    threshold = 0.25
    breakdowns = detect_breakdowns(df, SourceFeatures.SOURCEHTAQNI,
                                   window_size, threshold).astype("int64")

    threshold = 1000
    df[ProcessingFeatures.HT_SPARKS_COUNTER] = detect_sparks(
        df[SourceFeatures.SOURCEHTAQNV], breakdowns, threshold)
    df.loc[df[ProcessingFeatures.HT_SPARKS_COUNTER] == 0,
           ProcessingFeatures.HT_SPARKS_COUNTER, ] = np.nan
    df.loc[df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0,
           ProcessingFeatures.HT_SPARKS_COUNTER, ] = np.arange(
               1, (df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0).sum() + 1)
    df[ProcessingFeatures.HT_SPARKS_COUNTER] = df[
        ProcessingFeatures.HT_SPARKS_COUNTER].ffill()

    df.loc[df[SourceFeatures.SPARK_COUNTER] ==
           df[SourceFeatures.SPARK_COUNTER].shift(1),
           SourceFeatures.SPARK_COUNTER, ] = np.nan
    df.loc[df[SourceFeatures.SPARK_COUNTER] == 0,
           SourceFeatures.SPARK_COUNTER] = np.nan
    df.loc[df[SourceFeatures.SPARK_COUNTER] > 0,
           SourceFeatures.SPARK_COUNTER] = np.arange(
               1, (df[SourceFeatures.SPARK_COUNTER] > 0).sum() + 1)
    df[SourceFeatures.SPARK_COUNTER] = df[SourceFeatures.SPARK_COUNTER].ffill()

    fig, ax = plt.subplots(2, 1, sharex=True)

    ax_htv = ax[0].twinx()
    ax_hti = ax[0].twinx()
    ax_hti.spines["right"].set_position(("axes", 1.04))
    # make_patch_spines_invisible(par2)
    ax_hti.spines["right"].set_visible(True)

    # ax[0].plot(df[ProcessingFeatures.HT_SPARKS_COUNTER], color='red')
    ax[0].plot(df[SourceFeatures.BCT05_CURRENT], color="red")
    ax_htv.plot(df[SourceFeatures.SOURCEHTAQNV])
    ax_hti.plot(df[SourceFeatures.SOURCEHTAQNI], color="orange")

    sparks_real = df[SourceFeatures.SPARK_COUNTER]
    ax12 = ax[1].twinx()
    ax[1].plot(df[ProcessingFeatures.HT_SPARKS_COUNTER], color="red")
    ax[1].plot(sparks_real, color="orange")

    plt.show()
Code example #4
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "../Data_Clustered/"  # Base folder of clustered data
    filename = "JanNov2018.csv"  # The file to load

    ######################
    ######## CODE ########
    ######################

    columns = [
        SourceFeatures.TIMESTAMP,
        SourceFeatures.BCT25_CURRENT,
        ProcessingFeatures.SOURCE_STABILITY,
    ]

    # Load file into a data frame
    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, columns, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    dates_stable = matplotlib.dates.date2num(
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 1].index.values)
    dates_unstable = matplotlib.dates.date2num(
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 0].index.values)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(
        dates_unstable,
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 0,
               SourceFeatures.BCT25_CURRENT].values,
        fmt=".",
        c="red",
        markersize=1,
    )
    ax.plot_date(
        dates_stable,
        df.loc[df[ProcessingFeatures.SOURCE_STABILITY] == 1,
               SourceFeatures.BCT25_CURRENT].values,
        fmt=".",
        c="black",
        markersize=1,
    )
    ax.set_ylim(-0.01, 0.08)
    ax.set_ylabel("BCT25 current [µA]")

    figManager = plt.get_current_fig_manager()
    figManager.window.showMaximized()
    plt.subplots_adjust(left=0.05,
                        bottom=0.05,
                        right=0.95,
                        top=0.93,
                        wspace=None,
                        hspace=0.4)
    plt.show()
Code example #5
def main(plot):
    year = 2016
    start_month = "Jan"
    end_month = "Nov"

    data_path = "../Data_Raw"

    for i, m in enumerate(
        months[months.index(start_month) : months.index(end_month) + 1]
    ):
        file_path = f"{data_path}/{m}{year}.csv"
        print(f"HT sparks for {file_path}")

        previous_month_file = None
        if i > 0:
            m_prev = months[months.index(m) - 1]
            previous_month_file = f"{data_path}/{m_prev}{year}_htv.csv"

        df = ld.read_data_from_csv(file_path, None, None)
        df = ld.fill_columns(df, previous_month_file, fill_nan_with_zeros=True)
        # df = ld.convert_column_types(df)

        # First we mark all time periods where the variance of the HT current is
        # above a certain threshold to exclude all these windows from our analysis
        window_size = 40
        threshold_breakdowns = 0.25
        breakdowns = detect_breakdowns(
            df, SourceFeatures.SOURCEHTAQNI, window_size, threshold_breakdowns
        ).astype("int64")

        # Then we search for all downward spikes in the HT voltage that fall below 1000V
        # and have a prominence of 500V, i.e. are significant compared to the background.
        # These are the actual sparks and can be compared with IP.NSRCGEN:SPARKS for 2018
        threshold_sparks = 1000
        sparks = detect_sparks(
            df[SourceFeatures.SOURCEHTAQNV], breakdowns, threshold_sparks
        )

        df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] = breakdowns
        df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] = df[
            ProcessingFeatures.HT_VOLTAGE_BREAKDOWN
        ].astype("Int32")

        df[ProcessingFeatures.HT_SPARKS_COUNTER] = sparks
        df[ProcessingFeatures.HT_SPARKS_COUNTER] = df[
            ProcessingFeatures.HT_SPARKS_COUNTER
        ].astype("Int32")
        # df.loc[df[ProcessingFeatures.HT_SPARKS_COUNTER] == 0, ProcessingFeatures.HT_SPARKS_COUNTER] = np.nan

        if plot:
            plot_breakdowns(df)

        mask = (df.shift(1) == df).fillna(value=True).astype(bool)
        df = df.where(~mask, np.nan)
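
The comments in this example outline the detection approach: first mark windows where the variance of the HT current exceeds a threshold (breakdowns) and exclude them, then count downward spikes in the HT voltage that drop below 1000 V with a prominence of about 500 V. As a rough illustration of the second step only, here is a minimal spark-detector sketch based on scipy.signal.find_peaks; it is an assumed stand-in, not the project's detect_sparks implementation, and it ignores the breakdown mask.

import numpy as np
import pandas as pd
from scipy.signal import find_peaks


def detect_sparks_sketch(htv, threshold=1000, prominence=500):
    # Hypothetical illustration: a spark is a local minimum of the HT voltage
    # that falls below `threshold` volts and is at least `prominence` volts
    # deeper than its surroundings.
    voltage = htv.to_numpy(dtype=float)
    # find_peaks searches for maxima, so negate the signal to look for dips.
    dips, _ = find_peaks(-voltage, height=-threshold, prominence=prominence)
    sparks = np.zeros(len(voltage), dtype=int)
    sparks[dips] = 1  # one count per detected downward spike
    return pd.Series(sparks, index=htv.index)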
Code example #6
File: cluster_sparks.py  Project: mexxexx/ionsrcopt
def main(input_file, output_file):
    folder = "../Data_Clustered/"
    input_file = f"{folder}{input_file}.csv"
    output_file = f"{folder}{output_file}.csv"

    df = ld.read_data_from_csv(input_file, None, None)

    df = fill_columns(df)
    df = reset_breakdown_clusters(df)
    df = assign_clusters(df)

    mask = (df.shift(1) == df).fillna(value=True).astype(bool)
    df = df.where(~mask, np.nan)
    # df = df.round(4)

    df.to_csv(output_file)
Code example #7
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "../Data_Clustered/"  # Base folder of clustered data
    filename = "JanNov2016.csv"  # The file to load

    features = [
        SourceFeatures.BIASDISCAQNV,
        SourceFeatures.GASAQN,
        SourceFeatures.OVEN1AQNP,
        SourceFeatures.THOMSON_FORWARDPOWER,
        SourceFeatures.SOLINJ_CURRENT,
        SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.BCT25_CURRENT,
    ]  # Features to be displayed

    args = parse_args()
    source_stability = args["source_stability"]
    cluster = args["cluster"]
    sample_size = args["sample_size"]

    ######################
    ######## CODE ########
    ######################

    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    df = df.loc[(df[ProcessingFeatures.SOURCE_STABILITY] == source_stability)].copy()
    if cluster is not None:
        df = df.loc[(df[ProcessingFeatures.CLUSTER] == cluster)].copy()

    index_length = len(df.index)
    indices = np.random.permutation(range(index_length))[
        : min(sample_size, index_length)
    ]

    data = df.loc[df.index[indices]].copy()

    sns.pairplot(data, vars=features, hue=ProcessingFeatures.CLUSTER)
    plt.show()
Code example #8
def main(year, source_stability, cluster, show_breakdowns):
    ######################
    ###### SETTINGS ######
    ######################

    if year == 2018:
        input_file = "../Data_Clustered/JanNov2018_sparks_clustered.csv"
        # features.append(SourceFeatures.SAIREM2_FORWARDPOWER)
    elif year == 2016:
        input_file = "../Data_Clustered/JanNov2016_sparks_clustered.csv"
        # features.append(SourceFeatures.THOMSON_FORWARDPOWER)

    features = [
        # SourceFeatures.BIASDISCAQNV,
        # SourceFeatures.GASAQN,
        # SourceFeatures.OVEN1AQNP,
        # SourceFeatures.OVEN2AQNP,
        # SourceFeatures.SOLINJ_CURRENT,
        # SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNV,
        SourceFeatures.SOURCEHTAQNI,
    ]  # Features to be displayed

    features.append(SourceFeatures.BCT25_CURRENT)

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    df = ld.read_data_from_csv(input_file, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    if cluster is not None:
        df = df[(df[ProcessingFeatures.CLUSTER] == cluster)].copy()
    df = df.loc[df[ProcessingFeatures.SOURCE_STABILITY] ==
                source_stability].copy()

    dates_nobreakdown = matplotlib.dates.date2num(
        df[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0].index)
    dates_breakdown = matplotlib.dates.date2num(
        df[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0].index)

    dates = df.index.values
    # datesIndices = np.arange(len(dates))

    df[SourceFeatures.BCT25_CURRENT] *= 1000

    fig, ax = plt.subplots(len(features), 1, sharex=True, figsize=(6, 6))
    for i, parameter in enumerate(features):
        # formatter = DateFormatter(dates)
        # ax[i].xaxis.set_major_formatter(formatter)
        ax[i].set_ylabel("{}".format(parameter), labelpad=40, fontsize=24)
        ax[i].set_xlabel("", labelpad=40, fontsize=24)
        ax[i].tick_params(axis="both", which="major", labelsize=22)

        if show_breakdowns:
            # ax[i].plot(datesIndices[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0], df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0, parameter].values, linestyle='', marker='.', markersize=1, color='#ff7f0e')
            ax[i].plot_date(
                dates_breakdown,
                df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] > 0,
                       parameter].values,
                linestyle="",
                marker=".",
                markersize=1,
                color="red",
            )

        ax[i].plot_date(
            dates_nobreakdown,
            df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0,
                   parameter].values,
            linestyle="",
            marker=".",
            markersize=1,
            color="black",
        )

        if show_breakdowns:
            ymin, ymax = ax[i].get_ylim()
            ax[i].vlines(
                df[df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0].index,
                ymin=ymin,
                ymax=ymax,
                color="black",
                ls="dashed",
                linewidths=1,
            )
        # ax[i].plot(datesIndices[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0], df.loc[df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0, parameter].values, linestyle='', marker='.', markersize=1, color='#1f77b4')
        ax[i].grid(True)
        ax[i].xaxis.set_major_locator(
            mdates.HourLocator(interval=24))  # to get a tick every 24 hours
        ax[i].xaxis.set_major_formatter(mdates.DateFormatter("%d-%m %H:00"))

    figManager = plt.get_current_fig_manager()
    # figManager.window.showMaximized()
    # fig.suptitle("Time development of cluster {}".format(cluster))
    plt.tight_layout()
    fig.align_ylabels()
    plt.show()
Code example #9
def equalize_class_data(sents_tokenized_train, labels_train):
    # Note: the original snippet begins mid-function; this header and the list
    # initializations are reconstructed from the call at the bottom of the file.
    # class_1_threshold is assumed to be defined elsewhere (e.g. in config).
    train_data_eq, train_labels_eq = [], []
    class_1_count = 0
    for sent, label in zip(sents_tokenized_train, labels_train):
        if label == 0:
            train_data_eq.append(sent)
            train_labels_eq.append(label)
        elif label == 1:
            if class_1_count <= class_1_threshold:
                train_data_eq.append(sent)
                train_labels_eq.append(label)
                class_1_count += 1

    return train_data_eq, train_labels_eq


if __name__ == '__main__':
    sents_tokenized_train, labels_train = read_data_from_csv(
        config.cola_tokenized_tsv_filename_train)
    sents_tokenized_dev, labels_dev = read_data_from_csv(
        config.cola_tokenized_tsv_filename_dev)

    # equalize class counts
    sents_tokenized_train, labels_train = equalize_class_data(
        sents_tokenized_train, labels_train)

    # shuffle the data
    sents_tokenized_train, labels_train = shuffle_data(
        sents_tokenized_train, labels_train)

    cv = CountVectorizer(analyzer='word', tokenizer=dummy_placeholder_func,
                         preprocessor=dummy_placeholder_func, token_pattern=None, ngram_range=(1, 3))
    cvX_train = cv.fit_transform(sents_tokenized_train)
    cvX_dev = cv.transform(sents_tokenized_dev)
Code example #10
def main():
    ######################
    ###### SETTINGS ######
    ######################

    clustered_data_folder = "Data_Clustered/"  # Base folder of clustered data
    filename = "JanNov2018.csv"  # The file to load

    source_stability = 1  # 1 if we want to look at a stable source, 0 otherwise
    cluster = 51  # The cluster to plot or None if you want to plot all data

    features = [
        SourceFeatures.BIASDISCAQNV,
        SourceFeatures.GASAQN,
        SourceFeatures.OVEN1AQNP,
        SourceFeatures.SAIREM2_FORWARDPOWER,
        SourceFeatures.SOLINJ_CURRENT,
        SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.BCT25_CURRENT,
    ]  # Features to be displayed

    normalize = True  # Do we want to rescale the data? (robust scaling is used below)
    bandwidth = np.array(
        [0.014, 0.011, 0.014, 0.014, 0.014, 0.014, 0.014, 0.014,
         0.014])  # bandwidth for unnormalized data
    # bandwidth = 0.02

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    path = clustered_data_folder + filename
    df = ld.read_data_from_csv(path, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    df = df.loc[df[ProcessingFeatures.SOURCE_STABILITY] ==
                source_stability, :].copy()
    total_duration = df[ProcessingFeatures.DATAPOINT_DURATION].sum()

    data = df[features].values
    weights = df[ProcessingFeatures.DATAPOINT_DURATION].values
    if normalize:
        # data = (data - np.mean(data, axis=0)) / np.std(data, axis=0) #Standard scaling
        # data = (data - np.min(data, axis=0)) / (np.max(data, axis=0) - np.min(data, axis=0)) #MinMax scaling
        # data = data / np.max(np.absolute(data), axis=0) #Max scaling
        data = (data - np.median(data, axis=0)) / (
            np.quantile(data, q=0.9, axis=0) - np.quantile(data, q=0.1, axis=0)
        )  # Robust scaler

    if cluster is not None:
        data = data[df[ProcessingFeatures.CLUSTER] == cluster]
        weights = weights[df[ProcessingFeatures.CLUSTER] == cluster]

    resolution = 5000
    # if cluster is not None:
    #    bandwidth *= 0.2
    num_kde_samples = 40000
    cluster_duration = np.sum(weights)
    percentage_of_values = cluster_duration / total_duration

    plot_cluster(
        data,
        weights,
        features,
        feature_ranges=None,
        median=None,
        resolution=resolution,
        bandwidth=bandwidth,
        num_kde_samples=num_kde_samples,
        cluster=cluster,
        percentage_of_values=percentage_of_values,
    )
Code example #11
def main(
    year,
    source_stability,
    count_breakdowns_per_cluster,
    num_clusters_to_visualize,
    print_to_file,
    display_metrics,
):
    ######################
    ###### SETTINGS ######
    ######################

    features = [
        SourceFeatures.BIASDISCAQNV,
        SourceFeatures.GASAQN,
        SourceFeatures.OVEN1AQNP,
        # SourceFeatures.OVEN2AQNP,
        SourceFeatures.SOLINJ_CURRENT,
        SourceFeatures.SOLCEN_CURRENT,
        SourceFeatures.SOLEXT_CURRENT,
        SourceFeatures.SOURCEHTAQNI,
        SourceFeatures.BCT25_CURRENT,
    ]  # Features to load

    if year == 2018:
        input_file = "../Data_Clustered/JanNov2018_sparks_clustered.csv"
        output_file = "./Results/2018_{}_sparks.csv".format(source_stability)
        features.append(SourceFeatures.SAIREM2_FORWARDPOWER)
    elif year == 2016:
        input_file = "../Data_Clustered/JanNov2016_sparks_clustered.csv"
        output_file = "./Results/2016_{}_sparks.csv".format(source_stability)
        features.append(SourceFeatures.THOMSON_FORWARDPOWER)
    elif year == 2015:
        input_file = "../Data_Clustered/MayDec2015_sparks_clustered.csv"
        output_file = "./Results/2015_{}_sparks.csv".format(source_stability)
        features.append(SourceFeatures.THOMSON_FORWARDPOWER)

    statistics = ["median", "std%"]  # Statistics we are interested in

    ######################
    ######## CODE ########
    ######################

    # Load file into a data frame
    df = ld.read_data_from_csv(input_file, None, None)
    df = ld.fill_columns(df, None, fill_nan_with_zeros=True)
    df = ld.convert_column_types(df)

    for feature in features:
        if feature not in df.columns:
            print(
                "{} does not exist as a feature in the loaded file. Aborting.".format(
                    feature
                )
            )
            return

    # Calculate oven refills
    oven_refill_ends = calculate_oven_refill_ends(df[SourceFeatures.OVEN1AQNP])
    if year == 2018:
        oven_refill_ends = clear_refills_2018(oven_refill_ends)
    elif year == 2016:
        oven_refill_ends = clear_refills_2016(oven_refill_ends)

    print("There were {} oven refills.".format(len(oven_refill_ends)))

    # Select only the stability level we are interested in
    df = df[df[ProcessingFeatures.SOURCE_STABILITY] == source_stability].copy()
    total_duration = df[ProcessingFeatures.DATAPOINT_DURATION].sum() / 3600

    # Describe the clusters
    print("Calculating statistics...")
    described = df.groupby(ProcessingFeatures.CLUSTER).apply(
        describe_cluster,
        features=features,
        weight_column=ProcessingFeatures.DATAPOINT_DURATION,
        oven_refills=oven_refill_ends,
    )
    described[("DENSITY", "percentage")] = (
        described[("DURATION", "in_hours")] / total_duration * 100
    )

    # Gather statistics to output
    wanted_statistics = get_wanted_statistics(features, statistics) + [
        ("DENSITY", "percentage"),
        ("DURATION", "in_hours"),
        ("DURATION", "longest_in_hours"),
        ("DURATION", "num_splits"),
        ("REFILL", "index"),
        ("REFILL", "delta_in_hours"),
    ]
    if count_breakdowns_per_cluster:
        wanted_statistics += [("num_breakdowns", "per_hour")]

    # Calculate metrics
    if display_metrics:
        metrics = calculate_metrics(df, features)
        print("DBI is {}".format(np.mean(metrics["DBI"])))
        described.loc[described.index >= 0, ("METRICS", "DBI")] = metrics["DBI"]
        print("Silhouette is {}".format(np.mean(metrics["silhouette"])))
        described.loc[described.index >= 0, ("METRICS", "silhouette")] = metrics[
            "silhouette"
        ]

        wanted_statistics += [("METRICS", "DBI"), ("METRICS", "silhouette")]

    described.sort_values(by=[("DENSITY", "percentage")], ascending=False, inplace=True)

    print("Rounding values...")
    printable_clusters = described[wanted_statistics].head(n=num_clusters_to_visualize)
    print(
        "Sum of densities of printed clusters: {:.1f}%".format(
            printable_clusters[("DENSITY", "percentage")].sum()
        )
    )
    print(
        "Sum of duration of printed clusters when source was running: {:.1f}".format(
            printable_clusters.loc[
                printable_clusters.index >= 0, ("DURATION", "in_hours")
            ].sum()
        )
    )
    printable_clusters = round_described(
        printable_clusters,
        {
            SourceFeatures.BIASDISCAQNV: 0,
            SourceFeatures.GASAQN: 2,
            SourceFeatures.OVEN1AQNP: 1,
            SourceFeatures.OVEN2AQNP: 1,
            SourceFeatures.THOMSON_FORWARDPOWER: 0,
            SourceFeatures.SAIREM2_FORWARDPOWER: 0,
            SourceFeatures.SOLINJ_CURRENT: 0,
            SourceFeatures.SOLCEN_CURRENT: 0,
            SourceFeatures.SOLEXT_CURRENT: 0,
            SourceFeatures.SOURCEHTAQNI: 2,
            SourceFeatures.BCT25_CURRENT: 3,
        },
    )
    printable_clusters.rename(
        {
            SourceFeatures.BIASDISCAQNV: "bias disc",
            SourceFeatures.GASAQN: "gas",
            SourceFeatures.OVEN1AQNP: "oven1",
            SourceFeatures.OVEN2AQNP: "oven2",
            SourceFeatures.SAIREM2_FORWARDPOWER: "RF",
            SourceFeatures.THOMSON_FORWARDPOWER: "RF",
            SourceFeatures.SOLINJ_CURRENT: "solinj",
            SourceFeatures.SOLCEN_CURRENT: "solcen",
            SourceFeatures.SOLEXT_CURRENT: "solext",
            SourceFeatures.SOURCEHTAQNI: "HTI",
            SourceFeatures.BCT25_CURRENT: "BCT25",
        },
        axis="columns",
        inplace=True,
    )

    if print_to_file:
        printable_clusters.to_csv(output_file)
        print("Saved result to {}".format(output_file))
    else:
        print(printable_clusters)