Example 1
    def annotate_events(self, df):
        window_size = self.window_size
        active_features = self.active_features
        og_cols = df.columns.tolist()
        df_feat, features = head_features.apply_feature_engineering(
                df,
                relevant_features=self.relevant_features,
                )
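        # Feature engineering may leave NaNs (e.g. from lagged or rolling
        # features); zero-fill them before windowing.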
        df_feat.fillna(0, inplace=True)
        df_w, feats = head_features.generate_windows(
            df_feat,
            window=window_size,
            relevant_features=features,
            )
        # Cut off the tail end of the data (lots of null values)
        df_w = df_w.loc[0:(len(df_w) - window_size)].copy()
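        # Classify every window using only the features the model was trained on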
        Y = self.model.predict(df_w[active_features])
        df_w['class'] = Y
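        # Carry the original (non-feature) columns back onto the windowed frame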
        for c in og_cols:
            if c not in active_features:
                df_w[c] = df[c]

        self.df = df_w
        print(Y)
        # These are the raw events
        return self.find_true_events(df, Y.tolist(), index_col='frameIndex')
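
A minimal usage sketch for the method above. `EventAnnotator` is a hypothetical wrapper class; only the attribute and method names it relies on (model, window_size, active_features, relevant_features, annotate_events) come from the snippet, and the file name is illustrative.

import pandas as pd

df = pd.read_csv("session_01.csv")  # hypothetical input with a frameIndex column
annotator = EventAnnotator(window_size=10)  # hypothetical constructor
events = annotator.annotate_events(df)
print(events)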
Example 2
import os

import pandas as pd

import head_features  # project-local module providing the feature helpers
def generate_training_set(
    director,
    feature_generator=lambda df, cols: head_features.apply_feature_engineering(df, cols),
    k=4,
    window_size=10,
    relevant_features=None,
    verbose=True,
):
    """
    Given the directory of data files,
    cluster the event points and saves the results
    in the merged folder. It appends an extra `class` column
    to represente the labeling results.
    """
    if relevant_features is None:
        relevant_features = []
    training_data = pd.DataFrame()
    active_features = []
    for csv in os.listdir(director):
        if csv.endswith(".csv"):
            fi_path = os.path.join(director, csv)
            df = pd.read_csv(fi_path)
            if verbose:
                print(fi_path)
            # Save to raw so the original data is kept. We are interested in
            # keeping the original data for these rows
            df["noseX_raw"] = df["noseX"]
            df["noseY_raw"] = df["noseY"]
            # features
            df, active_features = feature_generator(df, relevant_features)
            df.fillna(0, inplace=True)
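            # Expand the features over sliding windows; generate_windows also
            # returns the per-window feature names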
            df_w, active_features = head_features.generate_windows(
                df, window=window_size, relevant_features=active_features
            )
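            # Cut off the tail end of the data (windows there are full of nulls)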
            df_trimmed = df_w.loc[0 : (len(df_w) - window_size)].copy()
            # cluster individual files
            Y = cluster_training_signals(df_trimmed, active_features, k)
            df_trimmed["class"] = Y
            training_data = pd.concat([training_data, df_trimmed])
    df_w = training_data
    if verbose:
        print "Now clustering the data"
        # active_columns = get_active_features(df, ignore_columns)

        # cluster all together
        print "Number of data points clustered:", len(df_w)
        print "Features used to cluster:\n"
        for c in active_features:
            print "\t%s" % c
    return df_w, active_features
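
A minimal usage sketch, assuming a directory of per-session CSV files containing the noseX/noseY columns the function expects; the input path, output location, and feature names are illustrative.

df_labeled, features = generate_training_set(
    "data/sessions",
    k=4,
    window_size=10,
    relevant_features=["noseX", "noseY"],
)
df_labeled.to_csv("data/merged/training_set.csv", index=False)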