def distance_array(self, separate):
    """Build the list of (distance, email) pairs measured from the clustroid.

    Candidates are taken from ``self.working_set`` when it is set, otherwise
    from the first four groups of the tester's ``train_examples``. When
    ``separate`` is truthy, only emails whose ``train`` attribute is in
    ``self.train`` are kept and the result is asserted to be non-empty.
    The list is sorted in place when ``self.sort_first`` is truthy.
    """
    groups = self.active_unlearner.driver.tester.train_examples
    if self.working_set is None:
        candidates = chain(groups[0], groups[1], groups[2], groups[3])
    else:
        candidates = self.working_set

    if separate:
        dist_list = [(distance(self.clustroid, train, self.opt), train)
                     for train in candidates
                     if train.train in self.train]
        assert len(dist_list) > 0
    else:
        dist_list = [(distance(self.clustroid, train, self.opt), train)
                     for train in candidates]

    if self.sort_first:
        dist_list.sort()
    return dist_list
def weighted_initial(self, working_set, mislabeled): if mislabeled is None: # Note that mislabeled is sorted in descending order by fabs(.50-email.prob) mislabeled = self.get_mislabeled() t_e = self.driver.tester.train_examples print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen) possible_centroids = list(mislabeled - self.mislabeled_chosen) print len(possible_centroids), " mislabeled emails remaining as possible cluster centroids" if len(possible_centroids) == 0: #No more centers to select return NO_CENTROIDS else: possible_centroids.sort(key=lambda x: fabs(.50-x.prob), reverse=True) mislabeled_point = possible_centroids[0] # Choose most potent mislabeled email self.mislabeled_chosen.add(mislabeled_point) print "Chose the mislabeled point: ", mislabeled_point.tag print "Probability: ", mislabeled_point.prob init_email = None training = chain(t_e[0], t_e[1], t_e[2], t_e[3]) if working_set is None else working_set if "frequency" in self.distance_opt: min_distance = sys.maxint mislabeled_point_frequencies = helpers.get_word_frequencies(mislabeled_point) for email in training: current_distance = distance(email, mislabeled_point_frequencies, self.distance_opt) if current_distance < min_distance: init_email = email min_distance = current_distance elif self.distance_opt == "intersection": min_distance = -1 for email in training: # select closest email to randomly selected mislabeled test email current_distance = distance(email, mislabeled_point, self.distance_opt) if current_distance > min_distance: init_email = email min_distance = current_distance else: min_distance = sys.maxint for email in training: # select closest email to randomly selected mislabeled test email current_distance = distance(email, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = email min_distance = current_distance print type(init_email) if init_email is None: print "Training emails remaining: ", training else: print "-> selected ", 
init_email.tag, " as cluster centroid with distance of ", min_distance, " from mislabeled point" return init_email
def testDistance(self):
    """distance("PLEASANTLY", "MEANLY") must report 5; the other two results are printed."""
    first, second = "PLEASANTLY", "MEANLY"
    result = distance(first, second)
    d, vv, ww = result
    print(vv)
    print(ww)
    assert d == 5
def invite_customer(file_name, invite_dist):
    """Return the customers listed in ``file_name`` that are closer than ``invite_dist``.

    The file is read as one JSON object per line. Each object must provide
    ``latitude``, ``longitude``, ``user_id`` and ``name``. Blank lines are
    skipped.

    Args:
        file_name: path of the customer file.
        invite_dist: exclusive upper bound on the value returned by
            ``distance(latitude, longitude)``.

    Returns:
        A list of ``[user_id, name, distance]`` entries in file order. The
        list is empty when the file cannot be read (a message is printed).
    """
    customer_list = []
    try:
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_name, "r") as fp:
            for cust in fp:
                if not cust.strip():
                    # Tolerate empty or trailing blank lines.
                    continue
                cust_json = json.loads(cust)
                dist = distance(float(cust_json['latitude']),
                                float(cust_json['longitude']))
                if dist < invite_dist:
                    customer_list.append(
                        [cust_json['user_id'], cust_json['name'], dist])
    except (OSError, IOError):
        print("unable to read the file {}".format(file_name))
    return customer_list
def ByWhat3Words():
    """Render the what3words form and, on a valid POST, the distance between the two addresses.

    Both submitted what3words addresses are converted to coordinates, passed
    to ``distance()`` and the rounded result is rendered together with a
    route-image URL. Any failure while geocoding or computing is logged with
    a traceback and reported to the user as a generic error.
    """
    form = What3WordsForm(request.form)
    if not (request.method == 'POST' and form.validate()):
        return render_template('ByWhat3Words.html', form=form, dis="")

    # NOTE(review): the API key and app credentials below are hard-coded in
    # source; consider loading them from configuration.
    geocoder = what3words.Geocoder("R4IPMCP6")
    try:
        data1 = geocoder.convert_to_coordinates(form.W3WAddress1.data)
        data2 = geocoder.convert_to_coordinates(form.W3WAddress2.data)
        lat1 = float(data1['coordinates']['lat'])
        lng1 = float(data1['coordinates']['lng'])
        lat2 = float(data2['coordinates']['lat'])
        lng2 = float(data2['coordinates']['lng'])
        # NOTE(review): ByCoordinate converts its inputs to radians before
        # calling distance(); here the values are passed as returned by the
        # geocoder. Confirm which unit distance() expects.
        dis = str(round(distance(lat1, lng1, lat2, lng2), 2))
        appID = "QV50Cg9nKusKIxU0xuxn"
        appCode = "MtWxs2XaYo4z_X8jc1n_9Q"
        # The same "lat1,lng1,lat2,lng2" list is used for both r0 and m0.
        waypoints = "%2C".join(str(value) for value in (lat1, lng1, lat2, lng2))
        imageurl = ("https://image.maps.api.here.com/mia/1.6/route?r0=" + waypoints
                    + "&m0=" + waypoints
                    + "&lc0=dc85ff&sc0=000000&lw0=6&w=500&app_id=" + appID
                    + "&app_code=" + appCode)
        return render_template('ByWhat3Words.html', form=form, dis=dis, url=imageurl)
    except Exception:
        # Top-level boundary of the view: log and show a friendly message,
        # but let KeyboardInterrupt/SystemExit propagate.
        traceback.print_exc()
        return render_template('ByWhat3Words.html', form=form,
                               dis="Error - please try again")
def mislabeled_initial(self, working_set, mislabeled): """Chooses an arbitrary point from the mislabeled emails and returns the training email closest to it.""" if mislabeled is None: mislabeled = self.get_mislabeled() t_e = self.driver.tester.train_examples print "Total Chosen: ", len(self.mislabeled_chosen) try: mislabeled_point = choice(list(mislabeled - self.mislabeled_chosen)) self.mislabeled_chosen.add(mislabeled_point) except: raise AssertionError(str(mislabeled)) min_distance = sys.maxint init_email = None training = chain(t_e[0], t_e[1], t_e[2], t_e[3]) if working_set is None else working_set for email in training: current_distance = distance(email, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = email min_distance = current_distance return init_email
def testFittingDistance(self):
    """distance("TAGGCTTA", "TAGATA") must report 5; the other two results are printed."""
    first, second = "TAGGCTTA", "TAGATA"
    result = distance(first, second)
    d, vv, ww = result
    print(vv)
    print(ww)
    assert d == 5
def find_nearest_neighbours(p: np.ndarray, points: np.ndarray, k: int = 5) -> np.ndarray:
    """Return the indices of the ``k`` entries of ``points`` nearest to ``p``.

    Args:
        p: the query point, passed unchanged to ``distance``.
        points: array whose first axis enumerates the candidate points.
        k: maximum number of neighbours to return.

    Returns:
        Indices into ``points`` ordered by increasing ``distance(p, point)``.
        At most ``k`` indices are returned; fewer when ``points`` has fewer
        than ``k`` entries.
    """
    distances = np.zeros(points.shape[0])
    for i, candidate in enumerate(points):
        distances[i] = distance(p, candidate)
    return np.argsort(distances)[:k]
def distance_array(self):
    """Return the list of (distance, email) pairs measured from the clustroid.

    Candidates are taken from ``self.working_set`` when it is set, otherwise
    from the first four groups of the tester's ``train_examples``. The list
    is sorted in place when ``self.sort_first`` is truthy.
    """
    train_examples = self.active_unlearner.driver.tester.train_examples
    if self.working_set is None:
        candidates = chain(train_examples[0], train_examples[1],
                           train_examples[2], train_examples[3])
    else:
        candidates = self.working_set

    dist_list = [(distance(self.clustroid, train, self.opt), train)
                 for train in candidates]

    if self.sort_first:
        dist_list.sort()
    return dist_list
def DistRank_F(address):
    """Build a per-zip-code ranking table for ``address``.

    ``address`` is geocoded with Nominatim. For every travel type and time
    in the fixed lists below, a column ``"Ranking of <type> <time> of
    Family"`` is filled with ``distance(loc, zip, type, time)`` for each zip
    code and passed through ``ranking()``; the row whose zip code equals the
    address's own zip code is then set to 5. For each travel type those
    columns are summed row-wise into ``"<Type> Area of Family"``, which is
    ranked again.

    Returns:
        A DataFrame with a "Zip Code" column plus the ranking columns.
    """
    zip_code = [
        '20001', '20002', '20003', '20004', '20005', '20006', '20007',
        '20008', '20009', '20010', '20011', '20012', '20015', '20016',
        '20017', '20018', '20019', '20020', '20024', '20032', '20036',
        '20037', '22201', '22202', '22203', '22204', '22205', '22206',
        '22207', '22209', '22211', '22213', '22214', '22301', '22302',
        '22304', '22305', '22311', '22314',
    ]
    # three types of isochrone
    types = ["driving", "cycling", "walking"]
    # time of isochrone
    times = [10, 30, 60]

    # Column labels must be an ordered sequence, so a list is used rather
    # than a set literal.
    df = pd.DataFrame(zip_code, columns=["Zip Code"])

    from geopy.geocoders import Nominatim
    # create a random name of agent so that the service will not time out
    agent = "distance" + str(random.randint(0, 100))
    geolocator = Nominatim(user_agent=agent)
    # NOTE(review): geocode() may return None for an unresolvable address,
    # in which case the attribute access below raises AttributeError.
    location = geolocator.geocode(address)
    # loc is the location of address, as (longitude, latitude)
    loc = (location.longitude, location.latitude)
    # zi_p is the second-to-last comma-separated field of the resolved
    # address, used as the address's zip code
    zi_p = location.address.split(",")[-2].lstrip()

    for ty_pe in types:
        for travel_time in times:
            name = "Ranking of " + str(ty_pe) + " " + str(travel_time) + " of Family"
            df[name] = [distance(loc, code, ty_pe, travel_time) for code in zip_code]
            df[name] = ranking(df[name])
            # The address's own zip code always gets the value 5.
            df.loc[df["Zip Code"] == zi_p, name] = 5

        name_col = str(ty_pe.title()) + " Area of Family"
        # Collect this travel type's ranking columns (whole-word match).
        cols = []
        for col in df.columns:
            words = col.split(" ")
            if ty_pe in words and "Family" in words:
                cols.append(col)
        df[name_col] = df[cols].sum(axis=1)
        df[name_col] = ranking(df[name_col])
    return df
def distance_array(self, separate): """Returns a list containing the distances from each email to the center.""" train_examples = self.active_unlearner.driver.tester.train_examples if separate: # if true, all emails must be same type (spam or ham) as centroid if self.working_set is None: if "frequency" in self.opt: print " Creating Distance Array using frequency method" dist_list = [(distance(train, self.cluster_word_frequency, self.opt), train) for train in chain(train_examples[0], train_examples[1], train_examples[2], train_examples[3]) if train.train in self.train] else: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in chain(train_examples[0], train_examples[1], train_examples[2], train_examples[3]) if train.train in self.train] else: if "frequency" in self.opt: print " Creating Distance Array using frequency method" dist_list = [(distance(train, self.cluster_word_frequency, self.opt), train) for train in self.working_set if train.train in self.train] else: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in self.working_set if train.train in self.train] assert (len(dist_list) > 0) else: if self.working_set is None: dist_list = [ (distance(self.clustroid, train, self.opt), train) for train in chain(train_examples[0], train_examples[1], train_examples[2], train_examples[3]) ] else: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in self.working_set] if self.sort_first: dist_list.sort( ) # sorts tuples by first element default, the distance if self.opt == "intersection": dist_list = dist_list[::-1] return dist_list # reverse the distance list so that closest element is at start print "\n ----------------Generated Distance Array----------------\n" print[email[0] for email in dist_list[:5]] return dist_list
def cluster_au(au, gold=False, pos_cluster_opt=0): """Clusters the training space of an ActiveUnlearner and returns the list of clusters.""" print "\n-----------------------------------------------------\n" cluster_list = [] training = au.shuffle_training() print "\nResetting mislabeled...\n" mislabeled = au.get_mislabeled(update=True) au.mislabeled_chosen = set() print "\nClustering...\n" original_training_size = len(training) while len(training) > 0: print "\n-----------------------------------------------------\n" print "\n" + str(len(training)) + " emails out of " + str(original_training_size) + \ " still unclustered. TEST1\n" current_seed = cluster_methods(au, "mislabeled", training, mislabeled) pre_cluster_rate = au.current_detection_rate # Sort TRAINING w.r.t. seed sorted_list = [(distance(current_seed, train, "inv-match"), train) for train in training] sorted_list.sort(key=operator.itemgetter(0)) #print "\n\n\nSorted List\n\n\n" #print sorted_list #print "\n\n\nEND\n\n\n" cluster_result = determine_cluster(current_seed, au, working_set=training, gold=gold, impact=True, pos_cluster_opt=pos_cluster_opt) #while cluster_result is None: #current_seed = cluster_methods(au, "mislabeled", training, mislabeled) #cluster_result = determine_cluster(current_seed, au, working_set=training, gold=gold, impact=True, # pos_cluster_opt=pos_cluster_opt) net_rate_change, cluster = cluster_result post_cluster_rate = au.current_detection_rate assert(post_cluster_rate == pre_cluster_rate), str(pre_cluster_rate) + " " + str(post_cluster_rate) cluster_list.append([net_rate_change, cluster]) print "\nRemoving cluster from shuffled training set...\n" for email in cluster.cluster_set: training.remove(email) cluster_list.sort() print "\nClustering process done and sorted.\n" return cluster_list
def row_sum_initial(self, working_set, mislabeled):
    """Returns the training email whose summed squared distance to the mislabeled emails is smallest.

    Training emails come from ``working_set``, or from the first four
    training groups when it is None. Returns None when no email has a row
    sum below ``sys.maxint``.
    """
    if mislabeled is None:
        mislabeled = self.get_mislabeled()
    t_e = self.driver.tester.train_examples

    best_rowsum = sys.maxint
    best_email = None
    if working_set is None:
        training = chain(t_e[0], t_e[1], t_e[2], t_e[3])
    else:
        training = working_set

    for email in training:
        rowsum = sum(distance(email, other, self.distance_opt) ** 2
                     for other in mislabeled)
        if rowsum < best_rowsum:
            best_rowsum, best_email = rowsum, email
    return best_email
def distance_array(self, separate): """Returns a list containing the distances from each email to the center.""" train_examples = self.active_unlearner.driver.tester.train_examples if separate: # if true, all emails must be same type (spam or ham) as centroid if self.working_set is None: if "frequency" in self.opt: print " Creating Distance Array using frequency method" dist_list = [(distance(train, self.cluster_word_frequency, self.opt), train) for train in chain(train_examples[0], train_examples[1], train_examples[2], train_examples[3]) if train.train in self.train] else: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in chain(train_examples[0], train_examples[1], train_examples[2], train_examples[3]) if train.train in self.train] else: if "frequency" in self.opt: print " Creating Distance Array using frequency method" dist_list = [(distance(train, self.cluster_word_frequency, self.opt), train) for train in self.working_set if train.train in self.train] else: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in self.working_set if train.train in self.train] assert(len(dist_list) > 0) else: if self.working_set is None: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in chain(train_examples[0], train_examples[1], train_examples[2], train_examples[3])] else: dist_list = [(distance(self.clustroid, train, self.opt), train) for train in self.working_set] if self.sort_first: dist_list.sort() # sorts tuples by first element default, the distance if self.opt == "intersection": dist_list = dist_list[::-1] return dist_list # reverse the distance list so that closest element is at start print "\n ----------------Generated Distance Array----------------\n" print [email[0] for email in dist_list[:5]]
def ByCoordinate():
    """Render the coordinate form and, on a valid POST, the distance between the two points.

    The four submitted values are multiplied by pi/180 and passed to
    ``distance()``; the rounded result is rendered together with a
    route-image URL built from the values as submitted. Any failure while
    converting or computing is logged with a traceback and reported to the
    user as a generic error.
    """
    form = CoordinateForm(request.form)
    if not (request.method == 'POST' and form.validate()):
        return render_template('ByCoordinate.html', form=form, dis="")

    try:
        # NOTE(review): these values are scaled by pi/180 before calling
        # distance(), whereas ByWhat3Words passes its coordinates unscaled.
        # Confirm which unit distance() expects.
        lat1 = float(form.Latitude1.data) * float(pi) / float(180)
        lon1 = float(form.Longitude1.data) * float(pi) / float(180)
        lat2 = float(form.Latitude2.data) * float(pi) / float(180)
        lon2 = float(form.Longitude2.data) * float(pi) / float(180)
        dis = str(round(distance(lat1, lon1, lat2, lon2), 2))
        # NOTE(review): app credentials are hard-coded in source; consider
        # loading them from configuration.
        appID = "QV50Cg9nKusKIxU0xuxn"
        appCode = "MtWxs2XaYo4z_X8jc1n_9Q"
        # The same "lat1,lon1,lat2,lon2" list (as submitted) is used for
        # both r0 and m0.
        waypoints = "%2C".join(str(value) for value in (
            form.Latitude1.data, form.Longitude1.data,
            form.Latitude2.data, form.Longitude2.data))
        imageurl = ("https://image.maps.api.here.com/mia/1.6/route?r0=" + waypoints
                    + "&m0=" + waypoints
                    + "&lc0=dc85ff&sc0=000000&lw0=6&w=500&app_id=" + appID
                    + "&app_code=" + appCode)
        return render_template('ByCoordinate.html', form=form, dis=dis, url=imageurl)
    except Exception:
        # Top-level boundary of the view: log and show a friendly message,
        # but let KeyboardInterrupt/SystemExit propagate.
        traceback.print_exc()
        return render_template('ByCoordinate.html', form=form,
                               dis="Error - please try again")
def test_four(self):
    """distance(-1, 1, 1, -1) should equal the square root of 8."""
    res = distance(-1, 1, 1, -1)
    # Compare floats with a tolerance rather than exact equality.
    self.assertAlmostEqual(res, 8 ** 0.5)
def test_three(self):
    """distance(1, 1, 1, 1) should be 0."""
    self.assertEqual(distance(1, 1, 1, 1), 0)
def test_two(self):
    """distance(0, 0, 1, 1) should equal the square root of 2."""
    res = distance(0, 0, 1, 1)
    # Compare floats with a tolerance rather than exact equality.
    self.assertAlmostEqual(res, 2 ** 0.5)
# this is redundant since current version is 2-D but required since distance and area functions are generic 3-D tmp_z_pos = (dim - 2) * rand.uniform(z_part_domain[0], z_part_domain[1]) # first particle if global_count == 0: particles_global[global_count, 2] = tmp_x_pos particles_global[global_count, 3] = tmp_y_pos global_count = global_count + 1 count = count + 1 else: # check for overlap with all previous particles dist_bol = distance(dim,global_count,particles_global,tmp_x_pos,tmp_y_pos,tmp_z_pos,dist_tol, \ x_part_domain_len,y_part_domain_len,z_part_domain_len) #print(dist_bol) if dist_bol == True: particles_global[global_count, 2] = tmp_x_pos particles_global[global_count, 3] = tmp_y_pos if dim == 3: particles_global[global_count, 4] = tmp_z_pos global_count = global_count + 1 count = count + 1 direction = 1 area_fraction(direction, dim, Dp, particles_global, x_part_domain,
def distance_array(self, separate):
    """Returns a list containing the distances from each email to the center.

    Candidates are taken from ``self.working_set`` when it is set, otherwise
    from the first four groups of the tester's ``train_examples``. When
    ``separate`` is truthy, only emails whose ``train`` attribute is in
    ``self.train`` are kept and the result is asserted to be non-empty.

    When ``separate`` is falsy, a working set exists and ``self.moving`` is
    truthy, a pass over the working set may swap adjacent entries of
    ``self.working_set`` in place and repeatedly overwrites
    ``self.clustroid.clues`` with the result of ``zombify``.

    The list is sorted in place when ``self.sort_first`` is truthy.
    """
    train_examples = self.active_unlearner.driver.tester.train_examples

    if separate:
        if self.working_set is None:
            dist_list = [(distance(self.clustroid, train, self.opt), train) for train in chain(train_examples[0],
                                                                                               train_examples[1],
                                                                                               train_examples[2],
                                                                                               train_examples[3])
                         if train.train in self.train]

        else:
            dist_list = [(distance(self.clustroid, train, self.opt), train) for train in self.working_set if
                         train.train in self.train]
            assert(len(dist_list) > 0)

    else:
        if self.working_set is None:
            dist_list = [(distance(self.clustroid, train, self.opt), train) for train in chain(train_examples[0],
                                                                                               train_examples[1],
                                                                                               train_examples[2],
                                                                                               train_examples[3])]

        else:
            if self.moving:
                dist_list = []
                for index, train in enumerate(self.working_set):
                    if index + 1 < len(self.working_set):
                        dist1 = distance(self.clustroid, self.working_set[index], self.opt)
                        dist2 = distance(self.clustroid, self.working_set[index + 1], self.opt)
                        # if top of queue isn't closest to new centroid, take the next item as the closest and
                        # swap their positions in the queue
                        # NOTE(review): the swap fires when dist1 < dist2, i.e. when the
                        # current item has the smaller value - confirm this matches the
                        # intent stated above.
                        if dist1 < dist2:
                            temp = dist2
                            dist2 = dist1
                            dist1 = temp
                            temp = self.working_set[index + 1]
                            self.working_set[index + 1] = self.working_set[index]
                            self.working_set[index] = temp
                        # NOTE(review): `train` was bound before any swap, so it may no
                        # longer be the email that dist1 was measured for.
                        dist_list.append((dist1, train))
                        # get common feature space - make zombie
                        commonFeatureSpace = zombify(self.clustroid, self.working_set[index])
                        self.clustroid.clues = commonFeatureSpace
                    else:
                        # Last element: there is no successor to compare against.
                        dist1 = distance(self.clustroid, self.working_set[index], self.opt)
                        dist_list.append((dist1, train))
                        # get common feature space - make zombie
                        commonFeatureSpace = zombify(self.clustroid, self.working_set[index])
                        self.clustroid.clues = commonFeatureSpace

            # NOTE(review): this rebuilds dist_list from scratch, so the list assembled
            # in the moving pass above is discarded; only that pass's changes to
            # self.working_set and self.clustroid.clues remain. Confirm intent.
            dist_list = [(distance(self.clustroid, train, self.opt), train) for train in self.working_set]

    if self.sort_first:
        dist_list.sort()

    return dist_list
def update_dist_list(self, separate=True):
    """Recompute self.dist_list against self.cluster_word_frequency and re-sort it.

    The emails currently held in ``self.dist_list`` are kept; only their
    distances are refreshed. ``separate`` is accepted but not used.
    """
    # second element of every entry is the email itself
    emails = [entry[1] for entry in self.dist_list]

    refreshed = []
    for email in emails:
        refreshed.append((distance(email, self.cluster_word_frequency, self.opt), email))

    self.dist_list = refreshed
    self.dist_list.sort()
def test_zero(self):
    """distance(0, 0, 0, 0) should be 0."""
    self.assertEqual(distance(0, 0, 0, 0), 0)
def chosen_sum(chosen, x, opt=None):
    """Return the sum of distance(msg, x, opt) over every msg in ``chosen`` (0 when empty)."""
    return sum(distance(msg, x, opt) for msg in chosen)
def webhook():
    """Handle an incoming webhook call and answer every text message it carries.

    For each messaging event that contains a message, the text (or 'NoText')
    is sent to ``wit_response``. Every recognised entity is routed to its
    handler; the response of the last recognised entity wins. A fallback
    text is sent when no handler produced a response.

    Returns:
        The tuple ``("ok", 200)``.
    """
    message = request.get_json()  # messages from users are fetched
    log(message)
    print(message)

    # entity name -> handler called as handler(entities, entity, values)
    handlers = {
        'get_class': classes,
        'feeling': feeling,
        'mood': mood,
        'greetings': hello,
        'bye': buy,
        'currency_1': amount_of_money,
        'docfinder': docfind,
        'mensa_hours': mensa_hours,
        'useful_inf_for_inc': useful_inf_for_inc,
        'fh': fh,
        'before_leaving': before_leaving,
        'tips': tips,
        'because_a': because_a,
        'fact': facts,
        'joke': jokes,
        'ok': ok,
        'weather': weather,
        'NeedHelp': needhelp,
        'no': no,
        'Start': start,
        'Stop': stop,
        'event': events,
        'game': games,
        'math': calc,
        'love_q': love_q,
        'hobby': hobby,
        'remind': remind,
        'google': search_g,
        'notable_person': person,
        'duration': duration,
        'amount_of_money': amount_of_money,
        'thanks': thanks,
        'SayThanks': saythanks,
        'local_search': local_search,
        'distance': distance,
    }

    if message['object'] == 'page':
        for entry in message['entry']:
            for messaging_event in entry['messaging']:
                # Extracting all IDs
                sender_id = messaging_event['sender']['id']
                recipient_id = messaging_event['recipient']['id']

                if not messaging_event.get('message'):
                    continue

                messaging_text = messaging_event['message'].get('text', 'NoText')

                response = None
                entities, values, dictionary_of_values_and_entities = wit_response(
                    messaging_text)

                for entity in entities:
                    handler = handlers.get(entity)
                    if handler is None:
                        # Unknown entity: keep whatever response we already have.
                        continue
                    response = handler(entities, entity, values)
                    if entity == 'get_class' and response is None:
                        response = "Sorry! I can not find this course..."

                if response is None:
                    response = "Sorry! I didn't understand your message..."
                bot.send_text_message(sender_id, response)

    return "ok", 200
def test_one(self):
    """distance(0, 0, 0, 1) should be 1."""
    self.assertEqual(distance(0, 0, 0, 1), 1)
def select_initial(self, option="mislabeled", distance_opt = "extreme"): """ Returns an email to be used as the initial unlearning email based on the mislabeled data (our tests show that the mislabeled and pollutant emails are strongly, ~80%, correlated) if option is true (which is default).""" mislabeled = self.get_mislabeled() t_e = self.driver.tester.train_examples print "Chosen: ", self.mislabeled_chosen print "Total Chosen: ", len(self.mislabeled_chosen) if option == "rowsum": # We want to minimize the distances (rowsum) between the email we select # and the mislabeled emails. This ensures that the initial email we select # is correlated with the mislabeled emails. minrowsum = sys.maxint init_email = None for email in chain(t_e[0], t_e[1], t_e[2], t_e[3]): rowsum = 0 for email2 in mislabeled: dist = distance(email, email2, distance_opt) rowsum += dist ** 2 if rowsum < minrowsum: minrowsum = rowsum init_email = email return init_email if option == "mislabeled": # This chooses an arbitrary point from the mislabeled emails and simply finds the email # in training that is closest to this point. try: mislabeled_point = choice(list(mislabeled - self.mislabeled_chosen)) self.mislabeled_chosen.add(mislabeled_point) except: raise AssertionError(str(mislabeled)) min_distance = sys.maxint for email in chain(t_e[0], t_e[1], t_e[2], t_e[3]): current_distance = distance(email, mislabeled_point, distance_opt) if current_distance < min_distance: init_email = email min_distance = current_distance return init_email if option == "max_sum": try: max_sum = 0 for email in chain(t_e[0], t_e[1], t_e[2], t_e[3]): current_sum = chosen_sum(self.training_chosen, email, distance_opt) if current_sum > max_sum: init_email = email max_sum = current_sum self.training_chosen.add(init_email) return init_email except: print "Returning initial seed based off of mislabeled...\n" return self.select_initial(option="mislabeled")
def weighted_initial(self, working_set, mislabeled): if mislabeled is None: # Note that mislabeled is sorted in descending order by fabs(.50-email.prob) mislabeled = self.get_mislabeled() t_e = self.driver.tester.train_examples print "Total Cluster Centroids Chosen: ", len(self.mislabeled_chosen) possible_centroids = list(mislabeled - self.mislabeled_chosen) print len( possible_centroids ), " mislabeled emails remaining as possible cluster centroids" if len(possible_centroids) == 0: #No more centers to select return NO_CENTROIDS else: possible_centroids.sort(key=lambda x: fabs(.50 - x.prob), reverse=True) mislabeled_point = possible_centroids[ 0] # Choose most potent mislabeled email self.mislabeled_chosen.add(mislabeled_point) print "Chose the mislabeled point: ", mislabeled_point.tag print "Probability: ", mislabeled_point.prob init_email = None training = chain(t_e[0], t_e[1], t_e[2], t_e[3]) if working_set is None else working_set if "frequency" in self.distance_opt: min_distance = sys.maxint mislabeled_point_frequencies = helpers.get_word_frequencies( mislabeled_point) for email in training: current_distance = distance(email, mislabeled_point_frequencies, self.distance_opt) if current_distance < min_distance: init_email = email min_distance = current_distance elif self.distance_opt == "intersection": min_distance = -1 for email in training: # select closest email to randomly selected mislabeled test email current_distance = distance(email, mislabeled_point, self.distance_opt) if current_distance > min_distance: init_email = email min_distance = current_distance else: min_distance = sys.maxint for email in training: # select closest email to randomly selected mislabeled test email current_distance = distance(email, mislabeled_point, self.distance_opt) if current_distance < min_distance: init_email = email min_distance = current_distance print type(init_email) if init_email is None: print "Training emails remaining: ", training else: print "-> selected ", 
init_email.tag, " as cluster centroid with distance of ", min_distance, " from mislabeled point" return init_email
def chosen_sum(chosen, x, opt=None):
    """Return the total of distance(msg, x, opt) over all msg in the chosen emails.

    An empty ``chosen`` collection yields 0.
    """
    return sum(distance(msg, x, opt) for msg in chosen)
def test_distance(self):
    """distance(54.180238, -5.920898), truncated to an int, should be 96."""
    truncated = int(distance(54.180238, -5.920898))
    self.assertEqual(truncated, 96)