def _pearson_score(features, target):
    """SelectKBest score function: per-column Pearson correlation with *target*.

    Returns the ``(scores, p_values)`` pair that ``SelectKBest`` expects.
    NOTE(review): the raw r is used as the score, so strongly *negatively*
    correlated features rank low — confirm abs(r) is not wanted instead.
    """
    stats = np.array([pearsonr(column, target) for column in features.T])
    return stats[:, 0], stats[:, 1]


def preprocessing(label_path='P_po/panas_positive_post.csv',
                  input_path='P_po/input_top5.csv',
                  k=10):
    """Load labels and features, binarize labels at their median, and keep
    the *k* features most Pearson-correlated with the binarized label.

    Parameters
    ----------
    label_path : str
        CSV of post-test scores; the first (index) column is dropped.
        Alternatives used in earlier runs: ``'F/FS_post.csv'``,
        ``'P_neg/panas_negative_post.csv'``.
    input_path : str
        CSV of input features; the first (index) column is dropped.
        Alternatives: ``'F/input_top5.csv'``, ``'P_neg/input_top5.csv'``.
    k : int
        Number of features SelectKBest retains.

    Returns
    -------
    tuple
        ``(X, y)`` — selected feature matrix and 1-D binary label vector.
    """
    # Labels: drop the index column, then split into two groups at the median.
    y = np.array(pd.read_csv(label_path, header=0))[:, 1:]
    thr = np.median(y)
    y = Binarizer(threshold=thr).fit_transform(y).reshape(-1)

    # Features: drop the index column and force a numeric dtype.
    X = np.array(pd.read_csv(input_path, header=0))[:, 1:].astype(np.float64)

    # Keep the k features with the highest Pearson correlation to y.
    X = SelectKBest(_pearson_score, k=k).fit_transform(X, y)
    return X, y
y = y.flatten() return X, y def get_column_names(path): with open(path) as fp: header = fp.readline().split(',') #[1:-1] return header X, y = get_training_data(data_path) letter_names = X[:, 0].reshape(-1, 1) letter_sounds = X[:, 1].reshape(-1, 1) # Binarize labels y = Binarizer(threshold=fail_threshold).transform(y.reshape(-1, 1)) datasets = [ ('letter names', letter_names, (0, 13, 26, 39, 52)), # ('letter sounds', letter_sounds, (0, 13, 26)) ] for independent_variable_name, X_data, X_ticks in datasets: # Create linear regression object regr = linear_model.LinearRegression(normalize=True) # Split into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X_data, y, test_size=0.2)
y = y.flatten() return X, y def get_column_names(path): with open(path) as fp: header = fp.readline().split(',') #[1:-1] return header X, y = get_training_data(data_path) letter_names = X[:, 0].reshape(-1, 1) letter_sounds = X[:, 1].reshape(-1, 1) # Binarize labels y = Binarizer(threshold=fail_threshold).transform(y.reshape(1, -1))[0] reading_data = (X, y) datasets = [reading_data] # points where we want ticks, as well as the label for that tick ticks = [[0, 0], [13, 7], [26, 13], [39, 20], [52, 26]] ticks = np.array(ticks) figure = plt.figure(figsize=(27, 9)) i = 1 # iterate over datasets # LN is X, LS is Y for ds_cnt, ds in enumerate(datasets): # preprocess dataset, split into training and test part