def test_on_animal_df(self):
    """
    Check extract_features on a small hand-built animal DataFrame.

    Runs extract_features in fillna mode on four of the six columns and
    compares both the resulting column names and the resulting values
    against hard-coded expectations.
    """
    # 'gender' and 'garbage' contain missing values and are left out of
    # column_list; the expected column names below contain neither.
    # np.nan is used rather than the np.NaN alias, which newer NumPy
    # releases no longer provide.
    animal_df = pd.DataFrame({'animal': ['dog', 'cat', 'rat'],
                              'color': ['white', 'brown', 'brown'],
                              'gender': ['F', 'F', np.nan],
                              'weight': [25, 5, 1],
                              'garbage': [0, 1, np.nan],
                              'abundance': [0.5, 0.4, 0.1]})
    extracted = bacteriopop_utils.extract_features(
        dataframe=animal_df,
        column_list=['animal', 'color', 'weight', 'abundance'],
        fillna=True
    )

    # check that the column names match what is expected
    self.assertEqual(extracted.columns.tolist(),
                     ['abundance',
                      'animal=cat',
                      'animal=dog',
                      'animal=rat',
                      'color=brown',
                      'color=white',
                      'weight'])

    # check that the values are what was expected.
    expected_result = np.array([[0.5, 0., 1., 0., 0., 1., 25.],
                                [0.4, 1., 0., 0., 1., 0., 5.],
                                [0.1, 0., 0., 1., 1., 0., 1.]])
    # .values replaces DataFrame.as_matrix(), which newer pandas releases
    # no longer provide; both yield the frame's data as a numpy array.
    self.assertEqual(expected_result.tolist(),
                     extracted.values.tolist())
def dbscan_demo():
    """
    Run the DBSCAN demo end to end, printing progress messages.

    Loads the data, runs extract_features on it, then passes the result
    to dbscan together with the arguments 0.2 and 10.
    """
    # Parenthesised single-argument print: prints the same text under the
    # Python 2 print statement and is a valid function call on Python 3.
    print('starting up')
    df = load_data()
    print('load done')
    df = extract_features(df)
    print('features done')
    # NOTE(review): 0.2 and 10 are presumably the DBSCAN eps and
    # min_samples settings -- confirm against dbscan's signature.
    dbscan(df, 0.2, 10)
def gmm_demo():
    """
    Run the GMM demo end to end, printing progress messages.

    Loads the data, runs extract_features on it, and calls gmm with the
    resulting frame and every column name except the first.
    """
    # Parenthesised single-argument print: prints the same text under the
    # Python 2 print statement and is a valid function call on Python 3.
    print('starting up')
    df = load_data()
    print('load done')
    df = extract_features(df)
    print('extract done')
    # The first column is skipped when building the feature list.
    features_list = list(df.columns.values)[1:]
    print('features done')
    gmm(df, features_list)
def main():
    """
    Entry point for all code.

    Loads the data, vectorizes it with extract_features (using
    FEATURES_TO_EXTRACT, fillna=True, debug=False), computes the
    feature/target correlation over all vectorized columns, and runs
    pca_bacteria with PCA_COMPONENTS.

    Returns:
        A tuple (target_correlation, pca) holding the results of
        calculate_features_target_correlation and pca_bacteria.
    """
    # Parenthesised single-argument print: prints the same text under the
    # Python 2 print statement and is a valid function call on Python 3.
    print("starting up")
    df = load_data()
    df_vectorized = extract_features(df,
                                     column_list=FEATURES_TO_EXTRACT,
                                     fillna=True,
                                     debug=False)
    target_correlation = calculate_features_target_correlation(
        df_vectorized,
        df_vectorized.columns.tolist(),
        PREDICTION_TARGET,
        PCA_METHOD)
    pca = pca_bacteria(df_vectorized, PCA_COMPONENTS)
    return target_correlation, pca
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from load_data import load_data
from bacteriopop_utils import extract_features

# Module-level data: loaded and passed through extract_features once,
# when this module is imported.
DF = load_data()
DF = extract_features(DF)


def get_random_color(pastel_factor=0.5):
    """
    Return a list of three random color components.

    Each component is drawn with random.uniform(0, 1.0), then shifted by
    pastel_factor and divided by (1.0 + pastel_factor).
    """
    return [(x+pastel_factor)/(1.0+pastel_factor)
            for x in [random.uniform(0, 1.0) for i in [1, 2, 3]]]


def color_distance(c1, c2):
    """
    Return the sum of absolute differences between paired components.

    Components are paired with zip, so extra components of the longer
    sequence are ignored.
    """
    return sum([abs(x[0]-x[1]) for x in zip(c1, c2)])


def generate_new_color(existing_colors, pastel_factor=0.5):
    """
    Search random candidates for a color far from existing_colors.

    Up to 100 candidates are drawn with get_random_color. If
    existing_colors is empty, the first candidate is returned at once.
    Otherwise each candidate is scored by its smallest color_distance to
    any existing color, and the largest such score seen is tracked.

    NOTE(review): the visible source stops after max_distance is updated;
    best_color is never assigned and no final return is shown. The
    definition appears to be cut off -- confirm against the full file.
    """
    max_distance = None
    best_color = None
    for i in range(0, 100):
        color = get_random_color(pastel_factor=pastel_factor)
        if not existing_colors:
            return color
        # Distance from this candidate to its nearest existing color.
        best_distance = min([color_distance(color, c) for c in existing_colors])
        # "not max_distance" is true for None and also for a stored 0.0.
        if not max_distance or best_distance > max_distance:
            max_distance = best_distance
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from load_data import load_data
from bacteriopop_utils import extract_features

# Module-level data: loaded and passed through extract_features once,
# when this module is imported.
DF = load_data()
DF = extract_features(DF)


def get_random_color(pastel_factor=0.5):
    """
    Return a list of three random color components.

    Each component is drawn with random.uniform(0, 1.0), then shifted by
    pastel_factor and divided by (1.0 + pastel_factor).
    """
    return [(x + pastel_factor) / (1.0 + pastel_factor)
            for x in [random.uniform(0, 1.0) for i in [1, 2, 3]]]


def color_distance(c1, c2):
    """
    Return the sum of absolute differences between paired components.

    Components are paired with zip, so extra components of the longer
    sequence are ignored.
    """
    return sum([abs(x[0] - x[1]) for x in zip(c1, c2)])


def generate_new_color(existing_colors, pastel_factor=0.5):
    """
    Search random candidates for a color far from existing_colors.

    Up to 100 candidates are drawn with get_random_color. If
    existing_colors is empty, the first candidate is returned at once.
    Otherwise each candidate is scored by its smallest color_distance to
    any existing color.

    NOTE(review): the visible source is cut off at the comparison against
    max_distance; the rest of the function is not shown -- confirm
    against the full file.
    """
    max_distance = None
    best_color = None
    for i in range(0, 100):
        color = get_random_color(pastel_factor=pastel_factor)
        if not existing_colors:
            return color
        # Distance from this candidate to its nearest existing color.
        best_distance = min(
            [color_distance(color, c) for c in existing_colors])
        # "not max_distance" is true for None and also for a stored 0.0.
        if not max_distance or best_distance > max_distance:
        # NOTE(review): source ends here; the body of this branch is not visible.
def gmm_demo():
    """
    Run the GMM demo without progress output.

    Loads the data, runs extract_features on it, and calls gmm with the
    resulting frame and every column name except the first.
    """
    feature_frame = extract_features(load_data())
    column_names = list(feature_frame.columns.values)
    # The first column is skipped when building the feature list.
    gmm(feature_frame, column_names[1:])