Ejemplo n.º 1
0
 def test_on_animal_df(self):
     """
     Check extract_features on a small animal DataFrame in fillna mode.

     Only 'animal', 'color', 'weight' and 'abundance' are requested. The
     test expects the string columns to come back as 'name=value'
     indicator columns and the numeric columns to pass through unchanged,
     with the columns in the order asserted below.
     """
     # np.nan is the canonical spelling; the np.NaN alias no longer exists
     # in NumPy 2.0.
     animal_df = pd.DataFrame({'animal': ['dog', 'cat', 'rat'],
                               'color': ['white', 'brown', 'brown'],
                               'gender': ['F', 'F', np.nan],
                               'weight': [25, 5, 1],
                               'garbage': [0, 1, np.nan],
                               'abundance': [0.5, 0.4, 0.1]})
     extracted = bacteriopop_utils.extract_features(
         dataframe=animal_df,
         column_list=['animal', 'color', 'weight', 'abundance'],
         fillna=True
         )
     # check that the column names match what is expected
     self.assertEqual(extracted.columns.tolist(),
                      ['abundance', 'animal=cat', 'animal=dog',
                       'animal=rat', 'color=brown', 'color=white',
                       'weight'])
     # check that the values are what was expected.
     expected_result = np.array([[0.5, 0., 1., 0., 0., 1., 25.],
                                 [0.4, 1., 0., 0., 1., 0., 5.],
                                 [0.1, 0., 0., 1., 1., 0., 1.]])
     # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
     # same ndarray and is available in old and new pandas alike.
     self.assertEqual(expected_result.tolist(),
                      extracted.values.tolist())
Ejemplo n.º 2
0
def dbscan_demo():
    """
    Run the DBSCAN demo end to end, printing a progress line per stage.

    Loads the data with load_data(), passes it through
    extract_features(), then calls dbscan() on the result.
    """
    # A parenthesized single-argument print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('starting up')
    df = load_data()
    print('load done')
    df = extract_features(df)
    print('features done')
    # NOTE(review): 0.2 and 10 are presumably DBSCAN's eps and min_samples;
    # confirm against the dbscan() signature.
    dbscan(df, 0.2, 10)
Ejemplo n.º 3
0
def dbscan_demo():
    """
    Load the data, extract its features and cluster them with dbscan().

    A short status message is printed before loading and after each of
    the load and feature-extraction steps.
    """
    # print(...) with one argument is valid on both Python 2 and Python 3
    # and emits exactly the same text as the old print statement.
    print('starting up')
    df = load_data()
    print('load done')
    df = extract_features(df)
    print('features done')
    # NOTE(review): the meaning of 0.2 and 10 is defined by dbscan(), which
    # is not visible here (likely eps / min_samples) -- verify.
    dbscan(df, 0.2, 10)
Ejemplo n.º 4
0
def gmm_demo():
    """
    Run the GMM demo end to end, printing a progress line per stage.

    Loads the data, extracts features, and calls gmm() with the resulting
    frame and every column name except the first.
    """
    # A parenthesized single-argument print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('starting up')
    df = load_data()
    print('load done')
    df = extract_features(df)
    print('extract done')
    # The first column is left out of the feature list passed to gmm().
    features_list = list(df.columns.values)[1:]
    print('features done')
    gmm(df, features_list)
Ejemplo n.º 5
0
def gmm_demo():
    """
    Load the data, extract its features and fit them with gmm().

    A status message is printed before loading and after each later
    step. gmm() receives the extracted frame plus all of its column names
    other than the first one.
    """
    # print(...) with one argument is valid on both Python 2 and Python 3
    # and emits exactly the same text as the old print statement.
    print('starting up')
    df = load_data()
    print('load done')
    df = extract_features(df)
    print('extract done')
    # Slice off the first column name; the rest are treated as features.
    features_list = list(df.columns.values)[1:]
    print('features done')
    gmm(df, features_list)
Ejemplo n.º 6
0
def main():
    """
    Entry point for all code.

    Loads the data, vectorizes the FEATURES_TO_EXTRACT columns with
    fillna enabled, then computes the feature/target correlation and the
    PCA on the vectorized frame.

    Returns:
        tuple: ``(target_correlation, pca)``, the results of
        calculate_features_target_correlation() and pca_bacteria().
    """
    # Parenthesized form works as both the Python 2 print statement and
    # the Python 3 print function.
    print("starting up")
    df = load_data()
    df_vectorized = extract_features(
        df, column_list=FEATURES_TO_EXTRACT, fillna=True, debug=False)
    target_correlation = calculate_features_target_correlation(
        df_vectorized, df_vectorized.columns.tolist(),
        PREDICTION_TARGET, PCA_METHOD)
    pca = pca_bacteria(df_vectorized, PCA_COMPONENTS)
    return target_correlation, pca
Ejemplo n.º 7
0
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from load_data import load_data
from bacteriopop_utils import extract_features

# Module-level dataset, built once at import time: the loaded data is fed
# straight through extract_features() and the result is kept as DF.
DF = extract_features(load_data())

def get_random_color(pastel_factor=0.5):
    """
    Return a random color as a list of three floats.

    Each channel is drawn uniformly from [0, 1], shifted up by
    ``pastel_factor`` and rescaled by ``1 + pastel_factor``.
    """
    # Draw all three channels first, then rescale them.
    raw_channels = [random.uniform(0, 1.0) for _ in range(3)]
    rescaled = []
    for channel in raw_channels:
        rescaled.append((channel + pastel_factor) / (1.0 + pastel_factor))
    return rescaled


def color_distance(c1, c2):
    """
    Return the sum of absolute channel-wise differences of two colors.

    Channels are paired with zip(), so any extra channels on the longer
    input are ignored.
    """
    total = 0
    for first, second in zip(c1, c2):
        total += abs(first - second)
    return total


def generate_new_color(existing_colors, pastel_factor=0.5):
    """
    Search random candidate colors for one far from ``existing_colors``.

    Up to 100 candidates are drawn with get_random_color(). If
    ``existing_colors`` is empty, the first candidate is returned at once.
    Otherwise each candidate is scored by its smallest color_distance() to
    any existing color, and the largest score seen so far is kept in
    ``max_distance``.

    NOTE(review): in the code visible here ``best_color`` is never updated
    and nothing is returned after the loop, so a non-empty
    ``existing_colors`` yields None. The tail of the function (presumably
    ``best_color = color`` and ``return best_color``) appears to be
    missing -- confirm against the full source.
    """
    max_distance = None
    best_color = None
    for i in range(0, 100):
        color = get_random_color(pastel_factor=pastel_factor)
        if not existing_colors:
            return color
        # Score the candidate by its distance to the nearest existing color.
        best_distance = min([color_distance(color, c) for c in
                             existing_colors])
        # `not max_distance` is true for None (first pass) and also for 0.
        if not max_distance or best_distance > max_distance:
            max_distance = best_distance
Ejemplo n.º 8
0
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from load_data import load_data
from bacteriopop_utils import extract_features

# Shared module-level frame: load the data once on import and keep only
# its extracted-feature form under the name DF.
DF = extract_features(load_data())


def get_random_color(pastel_factor=0.5):
    """
    Build a random three-channel color, lightened by ``pastel_factor``.

    Every channel starts as a uniform draw from [0, 1]; it is then offset
    by ``pastel_factor`` and divided by ``1 + pastel_factor``. The result
    is returned as a list of three floats.
    """
    draws = []
    for _ in (1, 2, 3):
        draws.append(random.uniform(0, 1.0))
    return [(value + pastel_factor) / (1.0 + pastel_factor)
            for value in draws]


def color_distance(c1, c2):
    """
    Measure how far apart two colors are.

    The distance is the sum over paired channels of the absolute
    difference; pairing stops at the end of the shorter input.
    """
    gaps = [abs(left - right) for left, right in zip(c1, c2)]
    return sum(gaps)


def generate_new_color(existing_colors, pastel_factor=0.5):
    max_distance = None
    best_color = None
    for i in range(0, 100):
        color = get_random_color(pastel_factor=pastel_factor)
        if not existing_colors:
            return color
        best_distance = min(
            [color_distance(color, c) for c in existing_colors])
        if not max_distance or best_distance > max_distance:
Ejemplo n.º 9
0
def gmm_demo():
    """
    Run the GMM demo: load the data, extract features and call gmm().

    gmm() is given the extracted frame together with all of its column
    names except the first.
    """
    features = extract_features(load_data())
    column_names = list(features.columns.values)
    # Everything after the first column is passed on as the feature list.
    gmm(features, column_names[1:])
Ejemplo n.º 10
0
def gmm_demo():
    """
    Load the data, turn it into features and hand them to gmm().

    The feature list passed to gmm() is every column name of the
    extracted frame apart from the first one.
    """
    raw = load_data()
    vectorized = extract_features(raw)
    gmm(vectorized, list(vectorized.columns.values)[1:])