import collections
import datetime

import numpy as np
import pandas as pd

import utils


def search_cluster(cluster, uhistdict):
    ''' Given a cluster, find the true entry. '''
    cluster_centre = collections.OrderedDict()
    field_name = []
    field_clarity = []
    field_altdist = []
    for col in cluster.columns:
        if col == 'Id':
            # Skip the unique ID
            continue
        # Take the histogram of entries in the cluster
        x, y = utils.uniquehist(cluster[col].values, uhistdict[col][0])
        # Normalise y so it is a distribution
        y = y / sum(y)
        # Find the index which occurs more often than you'd expect by chance
        idx = np.argmax(y - uhistdict[col][1])
        # This value is the value of the true datapoint for this column
        cluster_centre[col] = uhistdict[col][0][idx]
        # Remember how obvious the field was: the ratio of the largest excess
        # over the background distribution to the second-largest excess
        sort_idx = np.argsort(y - uhistdict[col][1])
        field_name.append(col)
        field_clarity.append(
            (y[sort_idx[-1]] - uhistdict[col][1][sort_idx[-1]])
            / (y[sort_idx[-2]] - uhistdict[col][1][sort_idx[-2]]))
        field_altdist.append(y[sort_idx[-2]] - uhistdict[col][1][sort_idx[-2]])
    return cluster_centre, field_name, field_clarity, field_altdist
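# `utils.uniquehist` lives elsewhere in this repo. The sketch below is an
# assumption inferred purely from the call sites in this file, not the real
# implementation: called with one argument it returns the unique values and
# their counts; called with an explicit support it counts occurrences against
# that fixed set, so cluster histograms align bin-for-bin with the full-data
# histograms in uhistdict.
def _uniquehist_sketch(values, uniques=None):
    if uniques is None:
        # Unique values in sorted order, with how often each occurs
        uniques, counts = np.unique(values, return_counts=True)
    else:
        # Count occurrences against a caller-supplied support
        counts = np.array([np.sum(values == u) for u in uniques])
    return uniques, counts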
# Main #########################

print('Loading the raw test data')
# Load the 100,000 strong test data
test = pd.read_csv('data/test.csv')

# Add an Age column: days from opening until 2015-01-01
end_dt = datetime.datetime.strptime('2015-1-1', "%Y-%m-%d")
test['Age'] = [(end_dt - datetime.datetime.strptime(open_dt, "%m/%d/%Y")).days
               for open_dt in test['Open Date']]

# Build a dictionary of occurrences of each element for each column
uhistdict = {}
for col in test.columns:
    x, y = utils.uniquehist(test[col].values)
    uhistdict[col] = (x, y / sum(y))

# We will use the highest-entropy column: Opening Date (equivalent to Age).
# This is almost a unique identifier for the restaurant.
trigger_column = 'Age'
unique_triggers = uhistdict[trigger_column][0]

# Note how many entries we expect to see per trigger value.
# If a trigger matches more than 50% over this, there are probably two
# distinct restaurants sharing it.
expected_num_entries = len(test) * np.median(uhistdict[trigger_column][1])

genuinetestmap = collections.OrderedDict()
genuinetestdict = collections.OrderedDict()

for i, trigger in enumerate(unique_triggers):
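    # (Loop body continues beyond this excerpt.) The reasoning behind
    # expected_num_entries above: if each genuine restaurant were duplicated
    # k times, a typical Age value would hold a k/len(test) share of the
    # rows, so len(test) * median(share) recovers k; a trigger matching well
    # over 1.5 * k therefore probably merges two restaurants that happen to
    # share an opening date.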