def compute_events(data_dir, start_key, end_key, label='', output_dir=None):
    """Annotate head-turn events for every CSV in ``data_dir`` and merge them.

    Each ``*.csv`` file is run through ``HeadAnnotator`` to extract event
    spans. Each (start, end) event slice is optionally written out as a
    labeled training example, and the fully annotated frames of every file
    are concatenated into a single DataFrame.

    Args:
        data_dir: directory containing the raw per-session CSV files.
        start_key: key in the annotator's event hash marking event starts.
        end_key: key marking event ends, paired with ``start_key`` by index.
        label: class label written into the ``turn_sentiment`` column so the
            training data is 'labeled'.
        output_dir: if given, per-file annotated CSVs, per-event CSVs, and
            diagnostic plots are written under this directory.

    Returns:
        DataFrame with the annotated frames of all input files concatenated.
    """
    if output_dir is not None:  # identity check, not `!= None`
        make_dir(output_dir)
    df_merged = pd.DataFrame()
    for fname in os.listdir(data_dir):  # renamed: `csv` shadowed the stdlib module
        # endswith, not find: skip names that merely *contain* '.csv'
        if not fname.endswith('.csv'):
            continue
        fi_path = '%s/%s' % (data_dir, fname)
        df = pd.read_csv(fi_path)
        # Single annotator per file (the original built a second, dead one).
        ha = HeadAnnotator()
        event_hash, event_list = ha.annotate_events(df)
        df_p = ha.df
        print(event_hash)
        # Carry over any original columns the annotator dropped.
        fe_cols = df_p.columns.tolist()
        for c in df.columns.tolist():
            if c not in fe_cols:
                df_p[c] = df[c]
        if output_dir is not None:
            sub_dir = '%s/events' % output_dir
            make_dir(sub_dir)
            # Assume start-key and end-key events are paired by index.
            for i in range(len(event_hash[start_key])):
                if len(event_hash[end_key]) > i:
                    start = event_hash[start_key][i]
                    end = event_hash[end_key][i]
                    # .copy() so the column writes below don't hit a
                    # pandas SettingWithCopy on the slice view.
                    df_sub = df_p.loc[start:end].copy()
                    df_sub['original_index'] = df_sub.index
                    # Add a class so the training data is 'labeled'.
                    df_sub['turn_sentiment'] = label
                    print(fname)
                    df_sub.to_csv(
                        '%s/%s-%s.csv' % (sub_dir, fname.split('.')[0], i),
                        index=False)
            df_p.to_csv('%s/%s' % (output_dir, fname), index=False)
            visualize.plot_diagnostics(
                df_p, ha.active_features,
                '%s/%s' % (output_dir, fname.split('.')[0]),
                y_col='noseX')
        df_merged = pd.concat([df_merged, df_p])
    return df_merged
def print_test_data(m_df, cf, cf_string):
    """Print classifier accuracy on ``m_df`` and plot test diagnostics.

    Args:
        m_df: DataFrame holding the feature columns (module-level
            ``active_cols``) and a ground-truth 'class' column.
        cf: fitted classifier exposing ``predict``.
        cf_string: human-readable classifier name, used in the printed line
            and in the diagnostics output path.

    NOTE(review): predictions are written into the module-level ``df_test``
    (not ``m_df``) before plotting — confirm that is intentional and that
    ``df_test`` is row-aligned with ``m_df``.
    """
    Y_test = cf.predict(m_df[active_cols])
    # float() keeps the division from truncating under Python 2 int division.
    accuracy = np.sum(Y_test == m_df['class']) / float(len(m_df))
    print('%s  accuracy %s' % (cf_string, accuracy))
    df_test['class'] = Y_test
    plot_diagnostics(df_test, active_cols,
                     '%s/head-turn-test-%s' % (base_dir, cf_string))
# --- Script entry: build and persist the merged head-turn training set. ---

# The directory (labeled data) we are looking to load head turns from.
data_dir = sys.argv[1]
# Number of clusters to cluster the data on.
k = int(sys.argv[2])

# Dir to read data from.
base_dir = "data"
m_dir = "%s/%s" % (base_dir, data_dir)
output_dir = "data/merged"

# Columns excluded from feature generation.
ignore_columns = [
    "date",
    "frameIndex",
    "class",
    "time",
    "noseX_raw",
    "noseY_raw",
    "faceBottom",
    "faceTop",
    "faceLeft",
    "faceRight",
    "noseX",
    "noseY",
    "isFrontFace",
]

# Generate the merged training set, write it out, and plot diagnostics.
df, active_features = generate_training_set(
    m_dir,
    k=k,
    window_size=window_size,
    relevant_features=relevant_features,
)
df.to_csv("%s/%s.csv" % (output_dir, data_dir), index=False)
plot_diagnostics(df, active_features, "%s/%s" % (output_dir, data_dir))

# Record the selected features in the shared config file.
config["active_features"] = active_features
with open("config.json", "w") as outfile:
    json.dump(config, outfile)