import numpy as np
import pandas as pd

# AslDb comes from the project's asl_data module; get_word (used below)
# is a helper defined elsewhere in the project.
from asl_data import AslDb


def init():
    asl = AslDb()

    # dimensions
    hand = ['right', 'left']
    side = ['r', 'l']
    cartesian = ['x', 'y']
    polar = ['r', 'theta']

    # rename the raw data for consistency
    raw_names = {h + '-' + c: 'raw-' + h[0] + c
                 for h in hand for c in cartesian}
    asl.df = asl.df.rename(columns=raw_names)

    cartesian_features = ['grnd', 'norm', 'delta']
    features = {
        k: [k + '-' + h[0] + c for h in hand for c in cartesian]
        for k in cartesian_features
    }
    features['polar'] = ['polar' + '-' + s + c for s in side for c in polar]

    # derive the grounded features: hand position relative to the nose
    for f in features['grnd']:
        asl.df[f] = asl.df['raw' + f[-3:]] - asl.df['nose-' + f[-1:]]

    # derive the normalized features: per-speaker z-score of the raw values
    df_means = asl.df.groupby('speaker').mean()
    df_std = asl.df.groupby('speaker').std()
    for f in features['norm']:
        ref = 'raw' + f[-3:]
        asl.df[f] = (asl.df[ref] - asl.df['speaker'].map(df_means[ref])) \
            / asl.df['speaker'].map(df_std[ref])

    # derive the delta features: frame-to-frame differences of the
    # grounded values
    for f in features['delta']:
        ref = 'grnd' + f[-3:]
        asl.df[f] = asl.df[ref].diff().fillna(0)

    # derive the polar features: angle and radius of each hand relative
    # to the nose
    ref = 'grnd'
    asl.df['polar-rtheta'] = np.arctan2(asl.df[ref + '-rx'],
                                        asl.df[ref + '-ry'])
    asl.df['polar-ltheta'] = np.arctan2(asl.df[ref + '-lx'],
                                        asl.df[ref + '-ly'])
    asl.df['polar-rr'] = np.sqrt(asl.df[ref + '-rx'] ** 2
                                 + asl.df[ref + '-ry'] ** 2)
    asl.df['polar-lr'] = np.sqrt(asl.df[ref + '-lx'] ** 2
                                 + asl.df[ref + '-ly'] ** 2)

    training = {k: asl.build_training(v) for k, v in features.items()}

    # per-word sequence-length statistics
    xlens = training['grnd'].get_all_Xlengths()
    lens_stats = [(k, len(v[1]), min(v[1]), sum(v[1]) / len(v[1]),
                   max(v[1]), max(v[1]) - min(v[1]))
                  for k, v in xlens.items()]
    words_stats = pd.DataFrame.from_records(
        lens_stats,
        columns=['word', 'count', 'min', 'avg', 'max',
                 'range']).set_index('word')
    words_stats['spread'] = words_stats['range'] / words_stats['avg']

    # include all words (sequence lengths are positive, so min > 0 holds
    # for every word)
    min_len = 0
    words = words_stats[words_stats['min'] > min_len].sort_values(
        by='count', ascending=False).index.tolist()

    samples = dict()
    for f in features:
        samples[f] = {k: get_word(training, features, f, k) for k in words}

    # split each word's samples by whether some feature is (near-)constant,
    # i.e. likely a single-handed sign; >= makes the split exhaustive so no
    # sample is dropped
    threshold = 1e-9
    separated = {
        k: ([s for s in v if min(s.std()) < threshold],
            [s for s in v if min(s.std()) >= threshold])
        for k, v in samples['norm'].items()
    }
    separated_stats = pd.DataFrame.from_records(
        {k: (len(v[0]), len(v[1]))
         for k, v in separated.items()}).T.rename(columns={
             0: 'single',
             1: 'double'
         })

    return (asl, features, training, samples,
            words_stats.join(separated_stats), separated)
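# A minimal usage sketch (an assumption about the surrounding project, not
# code from it): it requires asl_data.AslDb and the get_word helper on the
# path. init() returns the database, the feature-name dict, the per-feature-
# set training data, the per-word samples, the joined word statistics, and
# the single-/double-handed split.
asl, features, training, samples, words_stats, separated = init()
print(features['polar'])  # ['polar-rr', 'polar-rtheta', 'polar-lr', 'polar-ltheta']
print(words_stats.sort_values(by='count', ascending=False).head())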
# Ground features: hand position relative to the nose
asl.df['grnd-ry'] = asl.df['right-y'] - asl.df['nose-y']
asl.df['grnd-rx'] = asl.df['right-x'] - asl.df['nose-x']
asl.df['grnd-ly'] = asl.df['left-y'] - asl.df['nose-y']
asl.df['grnd-lx'] = asl.df['left-x'] - asl.df['nose-x']
# collect the ground features into a list
features_ground = ['grnd-rx', 'grnd-ry', 'grnd-lx', 'grnd-ly']

# Normalized features: per-speaker z-score of each raw coordinate
features_norm = ['norm-rx', 'norm-ry', 'norm-lx', 'norm-ly']
lookup = asl.df.groupby('speaker').transform(
    lambda df: (df - df.mean()) / df.std())
asl.df = asl.df.assign(**{'norm-rx': lookup['right-x'],
                          'norm-lx': lookup['left-x'],
                          'norm-ry': lookup['right-y'],
                          'norm-ly': lookup['left-y']})

# Polar features: radius and angle of each hand relative to the nose
features_polar = ['polar-rr', 'polar-rtheta', 'polar-lr', 'polar-ltheta']
asl.df['polar-lr'] = np.sqrt((asl.df['left-x'] - asl.df['nose-x']) ** 2
                             + (asl.df['left-y'] - asl.df['nose-y']) ** 2)
asl.df['polar-rr'] = np.sqrt((asl.df['right-x'] - asl.df['nose-x']) ** 2
                             + (asl.df['right-y'] - asl.df['nose-y']) ** 2)
asl.df['polar-ltheta'] = np.arctan2(asl.df['left-x'] - asl.df['nose-x'],
                                    asl.df['left-y'] - asl.df['nose-y'])
asl.df['polar-rtheta'] = np.arctan2(asl.df['right-x'] - asl.df['nose-x'],
                                    asl.df['right-y'] - asl.df['nose-y'])

# Delta features: frame-to-frame differences of the raw coordinates
features_delta = ['delta-rx', 'delta-ry', 'delta-lx', 'delta-ly']
asl.df['delta-rx'] = asl.df['right-x'].diff().fillna(0)
asl.df['delta-ry'] = asl.df['right-y'].diff().fillna(0)
asl.df['delta-lx'] = asl.df['left-x'].diff().fillna(0)
asl.df['delta-ly'] = asl.df['left-y'].diff().fillna(0)
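# Quick sanity check (a sketch added here, not part of the original code):
# after the per-speaker z-score transform above, each speaker's norm-*
# columns should have mean close to 0 and standard deviation close to 1.
check = asl.df.groupby('speaker')[features_norm].agg(['mean', 'std'])
print(check.round(3))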