Example 1
import collections

import numpy as np

import utils  # project-local helper module providing uniquehist


def search_cluster(cluster, uhistdict):
    '''
    Given a cluster of records, estimate the true entry at its centre.

    For each column, pick the value that is over-represented in the
    cluster relative to the full test set, and record how clear-cut
    that choice was.
    '''
    cluster_centre = collections.OrderedDict()
    field_name = []
    field_clarity = []
    field_altdist = []
    for col in cluster.columns:
        if col=='Id':
            # Skip the unique ID
            continue
        # Histogram the cluster's entries, aligned to the unique
        # values of this column across the full test set
        x,y = utils.uniquehist(cluster[col].values, uhistdict[col][0])
        # Normalise y so it is a distribution
        y = y/sum(y)
        # Find the value most over-represented relative to the
        # baseline distribution over the full test set
        idx = np.argmax(y - uhistdict[col][1])
        # This value is the value of the true datapoint for this column
        cluster_centre[col] = uhistdict[col][0][idx]
        
        # Record how clear-cut the choice was: the ratio of the
        # largest excess over baseline to the second largest
        sort_idx = np.argsort(y - uhistdict[col][1])
        field_name.append(col)
        field_clarity.append(
            (y[sort_idx[-1]] - uhistdict[col][1][sort_idx[-1]]) /
            (y[sort_idx[-2]] - uhistdict[col][1][sort_idx[-2]]))
        # Also keep the runner-up's excess as an alternative signal
        field_altdist.append(y[sort_idx[-2]] - uhistdict[col][1][sort_idx[-2]])
    
    return cluster_centre, field_name, field_clarity, field_altdist
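
Both examples lean on a project-local helper, utils.uniquehist, whose source is not shown here. A minimal sketch of what it plausibly does, assuming it returns the unique values and their counts, aligned to a caller-supplied value array when one is given:

import numpy as np

def uniquehist(values, uniques=None):
    # Histogram over unique values. When `uniques` is supplied, counts
    # are aligned to that array so two histograms can be compared
    # element-wise; otherwise the unique values come from `values`.
    # NOTE: this is a reconstruction, not the project's actual code.
    if uniques is None:
        uniques, counts = np.unique(values, return_counts=True)
    else:
        counts = np.array([np.sum(values == u) for u in uniques])
    return uniques, counts

With y and uhistdict[col][1] both normalised to sum to one over the same value array, y - uhistdict[col][1] is positive exactly where the cluster over-represents a value relative to the whole test set, which is why its argmax picks out the candidate true value.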
Example 2
# Main
#########################

import collections
import datetime

import numpy as np
import pandas as pd

import utils  # project-local helper module providing uniquehist

print('Loading the raw test data')

# Load the 100,000-row test set
test = pd.read_csv('data/test.csv')

# Add an Age column: days between opening and 2015-01-01
end_dt = datetime.datetime.strptime('2015-01-01', "%Y-%m-%d")
test['Age'] = [(end_dt - datetime.datetime.strptime(open_dt, "%m/%d/%Y")).days
               for open_dt in test['Open Date']]
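
The list comprehension parses one date at a time; an equivalent vectorised form using pandas' own date parsing (an alternative sketch, not what the original script does) would be:

# Vectorised alternative: parse the whole column at once
test['Age'] = (end_dt - pd.to_datetime(test['Open Date'], format='%m/%d/%Y')).dt.days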

# Build a dictionary of occurrences of each element, per column
uhistdict = {}
for col in test.columns:
    x,y = utils.uniquehist(test[col].values)
    uhistdict[col] = (x, y/sum(y))  # unique values and their relative frequencies
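
For intuition, each entry pairs a column's unique values with their relative frequencies. With made-up numbers for illustration:

# Hypothetical contents only (frequencies invented for the example):
# uhistdict['City Group']  ->  (array(['Big Cities', 'Other']), array([0.6, 0.4]))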

# Use the highest-entropy column, Open Date (equivalent to Age):
# it is almost a unique identifier for each restaurant
trigger_column = 'Age'
unique_triggers = uhistdict[trigger_column][0]

# Note how many entries we expect to see per unique trigger value.
# If a value appears more than 1.5x as often, it probably belongs
# to two different restaurants.
expected_num_entries = len(test) * np.median(uhistdict[trigger_column][1])
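
As a short illustration (hypothetical, not part of the original script), the 1.5x rule could flag Age values shared by more than one restaurant like so:

# Hypothetical check: trigger values whose observed count exceeds
# the expected count by more than 50%
counts = test[trigger_column].value_counts()
probably_shared = counts[counts > 1.5 * expected_num_entries].index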

genuinetestmap = collections.OrderedDict()
genuinetestdict = collections.OrderedDict()

for i,trigger in enumerate(unique_triggers):