Beispiel #1
0
def remove_and_correct_outliers(data):
    """Replace per-feature outliers in ``data`` with the feature's mean.

    For every feature index ``i`` the values of all items are fetched through
    ``utils.get_utterance_values_of_ith_utterance`` and concatenated. A value
    is treated as an outlier when it lies more than ``factor`` scaled median
    absolute deviations (``MAD * 1.4826``) away from the feature's median.
    Outliers, and any NaN already present, are replaced by the mean of the
    remaining values. Every value is then written back rounded to 6 decimals.

    The mean imputation is done directly with NumPy instead of the
    scikit-learn ``Imputer(axis=1)`` call, which is no longer available in
    current scikit-learn releases.

    Args:
        data: indexable collection of items exposing a mutable ``values``
            sequence with one entry per feature. Modified in place.

    Returns:
        The same ``data`` object after the in-place correction.
    """
    b_constant = 1.4826  ## scales the MAD so it is comparable to a std-dev for normal data
    factor = 10  ## number of scaled MADs tolerated on each side of the median
    n_features = len(data[0].values)
    for i in range(n_features):
        ## the helper returns the values split in two lists (stress / non-stress);
        ## only their concatenation matters here
        d_s, d_ns, _, _ = utils.get_utterance_values_of_ith_utterance(data, i)
        f_vals = np.array(d_s + d_ns, dtype=float)

        median = np.median(f_vals)
        ## np.abs avoids the overflow/underflow of the sqrt(x ** 2) round trip
        med_abs_deviation = np.median(np.abs(f_vals - median))
        threshold = med_abs_deviation * b_constant
        max_range = median + threshold * factor
        min_range = median - threshold * factor
        ## NOTE(review): when the MAD is 0 (at least half of the values equal the
        ## median) the accepted range collapses to the median itself, so every
        ## other value is flagged as an outlier.
        outliers = (f_vals < min_range) | (f_vals > max_range)
        f_vals[outliers] = np.nan

        ## mean imputation over the values that are still valid; a feature with
        ## no valid value at all is left untouched because there is no mean
        missing = np.isnan(f_vals)
        if missing.any() and not missing.all():
            f_vals[missing] = f_vals[~missing].mean()

        ## NOTE(review): position j of the concatenated array is written to
        ## data[j]; this assumes the helper returns the values in an order that
        ## lines up with ``data`` -- confirm against the helper.
        for j, value in enumerate(f_vals):
            data[j].values[i] = round(value, 6)
    return data
Beispiel #2
0
def remove_and_correct_outliers(data):
    """Replace per-feature outliers in ``data`` with the feature's mean.

    For every feature index ``i`` the values of all items are fetched through
    ``utils.get_utterance_values_of_ith_utterance`` and concatenated. A value
    is treated as an outlier when it lies more than ``factor`` scaled median
    absolute deviations (``MAD * 1.4826``) away from the feature's median.
    Outliers, and any NaN already present, are replaced by the mean of the
    remaining values. Every value is then written back rounded to 6 decimals.
    The total number of detected outliers is printed at the end.

    The mean imputation is done directly with NumPy instead of the
    scikit-learn ``Imputer(axis=1)`` call, which is no longer available in
    current scikit-learn releases.

    Args:
        data: indexable collection of items exposing a mutable ``values``
            sequence with one entry per feature. Modified in place.

    Returns:
        The same ``data`` object after the in-place correction.
    """
    b_constant = 1.4826  ## scales the MAD so it is comparable to a std-dev for normal data
    factor = 10  ## number of scaled MADs tolerated on each side of the median
    count = 0  ## outliers detected over all features
    n_features = len(data[0].values)
    for i in range(n_features):
        ## the helper returns the values split in two lists (stress / non-stress);
        ## only their concatenation matters here
        d_s, d_ns, _, _ = utils.get_utterance_values_of_ith_utterance(data, i)
        f_vals = np.array(d_s + d_ns, dtype=float)

        median = np.median(f_vals)
        ## np.abs avoids the overflow/underflow of the sqrt(x ** 2) round trip
        med_abs_deviation = np.median(np.abs(f_vals - median))
        threshold = med_abs_deviation * b_constant
        max_range = median + threshold * factor
        min_range = median - threshold * factor
        ## NOTE(review): when the MAD is 0 (at least half of the values equal the
        ## median) the accepted range collapses to the median itself, so every
        ## other value is flagged as an outlier.
        outliers = (f_vals < min_range) | (f_vals > max_range)
        count += int(np.count_nonzero(outliers))
        f_vals[outliers] = np.nan

        ## mean imputation over the values that are still valid; a feature with
        ## no valid value at all is left untouched because there is no mean
        missing = np.isnan(f_vals)
        if missing.any() and not missing.all():
            f_vals[missing] = f_vals[~missing].mean()

        ## NOTE(review): position j of the concatenated array is written to
        ## data[j]; this assumes the helper returns the values in an order that
        ## lines up with ``data`` -- confirm against the helper.
        for j, value in enumerate(f_vals):
            data[j].values[i] = round(value, 6)

    ## single-argument print: same output as the old print statement
    ## ("Detected  N  outliers") and valid on both Python 2 and Python 3
    print("Detected  %d  outliers" % count)
    return data
Beispiel #3
0
def filter_features(data, n_features=6125, bins=(50, 100, 250, 500, 1000),
                    percentile=75):
    """Return the features that score highly for every bin count.

    Each feature is scored with ``calculate_mi`` (presumably a mutual
    information estimate -- confirm against its definition) once per entry of
    ``bins``. For every bin count the ``percentile``-th percentile of the
    scores over all features is used as the threshold. A feature is kept only
    if its score reaches the threshold for *all* bin counts.

    Args:
        data: dataset handed to ``utils.get_utterance_values_of_ith_utterance``.
        n_features: number of feature indices to evaluate (``0 .. n_features - 1``).
            Defaults to 6125, the value that used to be hard-coded.
        bins: bin counts passed to ``calculate_mi``.
        percentile: percentile (0-100) of the per-bin scores used as threshold.

    Returns:
        List of selected feature indices in ascending order.

    Raises:
        ValueError: if ``bins`` is empty.
    """
    bins = list(bins)
    if not bins:
        raise ValueError("bins must contain at least one bin count")
    if n_features <= 0:
        return []

    ## scores[index][k] is the score of feature `index` computed with bins[k]
    ## NOTE(review): constant features (max == min) are not filtered out here;
    ## they are ranked by their score like any other feature.
    scores = []
    for index in range(n_features):
        values_no_stress, values_stress, _, _ = (
            utils.get_utterance_values_of_ith_utterance(data, index))
        scores.append([calculate_mi(values_no_stress, values_stress, n_bins)
                       for n_bins in bins])

    ## one threshold per bin count, taken over the scores of all features
    thresholds = [np.percentile([row[k] for row in scores], percentile)
                  for k in range(len(bins))]

    ## keep the features that pass the threshold for every bin count
    return [index for index, row in enumerate(scores)
            if all(score >= limit for score, limit in zip(row, thresholds))]
    
    
        
        
Beispiel #4
0
def filter_features(data,
                    n_features=6125,
                    bins=(50, 100, 250, 500, 1000),
                    percentile=75):
    """Select the feature indices whose score is high for all bin counts.

    Every feature is scored with ``calculate_mi`` (presumably a mutual
    information estimate -- confirm against its definition), once for each
    entry of ``bins``. Per bin count, the ``percentile``-th percentile of the
    scores of all features becomes the threshold. Only features whose score
    is at or above the threshold for every bin count are returned.

    Args:
        data: dataset handed to ``utils.get_utterance_values_of_ith_utterance``.
        n_features: number of feature indices to evaluate (``0 .. n_features - 1``).
            Defaults to 6125, the value that used to be hard-coded.
        bins: bin counts passed to ``calculate_mi``.
        percentile: percentile (0-100) of the per-bin scores used as threshold.

    Returns:
        List of selected feature indices in ascending order.

    Raises:
        ValueError: if ``bins`` is empty.
    """
    bins = list(bins)
    if not bins:
        raise ValueError("bins must contain at least one bin count")
    if n_features <= 0:
        return []

    ## per_bin_scores[k][index] is the score of feature `index` for bins[k]
    ## NOTE(review): constant features (max == min) are not filtered out here;
    ## they are ranked by their score like any other feature.
    per_bin_scores = [[] for _ in bins]
    for index in range(n_features):
        values_no_stress, values_stress, _, _ = (
            utils.get_utterance_values_of_ith_utterance(data, index))
        for bin_scores, n_bins in zip(per_bin_scores, bins):
            bin_scores.append(
                calculate_mi(values_no_stress, values_stress, n_bins))

    ## one threshold per bin count, taken over the scores of all features
    thresholds = [
        np.percentile(bin_scores, percentile)
        for bin_scores in per_bin_scores
    ]

    ## a feature is selected only if it passes the threshold for every bin count
    most_selected_fts = []
    for index in range(n_features):
        passed_everywhere = all(
            bin_scores[index] >= limit
            for bin_scores, limit in zip(per_bin_scores, thresholds))
        if passed_everywhere:
            most_selected_fts.append(index)

    return most_selected_fts