def _identify_partial_matches(self, cohort_id, last_name):
    """Search for partial matches to identify students.

    Pulls data from a special set of memcache keys, which are updated by
    cron and provide the names of all students in the school. Each name is
    examined to see if the typed name is contained in, or contains, the
    existing name ("containment matching"); such names are considered
    partial matches. The matches are then ordered by their similarity
    (Levenshtein distance) to the typed name.
    """
    stripped_last_name = util.clean_string(last_name)
    match_data, from_memcache = self.internal_api.get_roster(cohort_id)

    # White list necessary properties (no sense in releasing status codes
    # like 'Parent Refusal' to the public).
    def clean_properties(d):
        white_list = ['first_name', 'last_name', 'classroom_name', 'id',
                      'stripped_last_name']
        return {k: v for k, v in d.items() if k in white_list}

    # Containment matching.
    matches = [
        clean_properties(u) for u in match_data
        if u['stripped_last_name'] in stripped_last_name
        or stripped_last_name in u['stripped_last_name']
    ]

    # Order by edit (Levenshtein) distance from the submitted name.
    sort_func = lambda n: util.levenshtein_distance(
        n['stripped_last_name'], stripped_last_name)
    return sorted(matches, key=sort_func)
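# A minimal, self-contained sketch of the containment matching and distance
# ordering used above. The roster data here is hypothetical, and
# levenshtein_distance stands in for util.levenshtein_distance (a plausible
# implementation is sketched after the tests below).
def _containment_matching_demo(typed='mart'):
    roster = [{'stripped_last_name': n} for n in ['martinez', 'mar', 'smith']]
    # Keep names that contain, or are contained by, the typed name.
    matches = [u for u in roster
               if u['stripped_last_name'] in typed
               or typed in u['stripped_last_name']]
    # Closest edit distance first: 'mar' (distance 1) sorts before
    # 'martinez' (distance 4); 'smith' is excluded entirely.
    return sorted(matches, key=lambda u: levenshtein_distance(
        u['stripped_last_name'], typed))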
def get_contributors_list(self):
    """Creates a list of dicts describing the collaborators on the
    repository.

    The list is aggregated following the algorithm in Valente's 2016 paper
    "A Novel Approach for Estimating Truck Factor", namely step #2,
    "Detect Developer Aliases". Collaborators are indexed by email, and
    each entry holds a list of developer aliases the user might go by. Two
    names are considered to belong to the same user if the Levenshtein
    distance between them is less than or equal to one.

    Returns:
        list: A list of dictionaries containing the fields 'email' and
        'dev_aliases'.
    """
    contributors_list = []
    if self.api_repository is not None:
        for collaborator in self.api_repository.get_collaborators():
            email = collaborator.email
            name = collaborator.name
            list_entry = {
                self.EMAIL_FIELD: email,
                self.DEV_ALIASES: [name]
            }
            fresh_user = True
            for contributor in contributors_list:
                # Entries are dicts, so aliases live under the DEV_ALIASES
                # key.
                for alias in contributor[self.DEV_ALIASES]:
                    if levenshtein_distance(alias, name) <= 1:
                        contributor[self.DEV_ALIASES].append(name)
                        fresh_user = False
                        break
                if not fresh_user:
                    break
            if fresh_user:
                contributors_list.append(list_entry)
    return contributors_list
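# The alias-merging rule above, isolated on hypothetical data (no GitHub API
# involved): a name within Levenshtein distance 1 of an existing alias joins
# that contributor's alias list; otherwise it starts a new entry.
# levenshtein_distance is assumed to be the sketch given after the tests
# below.
def _merge_aliases_demo():
    names = ['John Doe', 'Jon Doe', 'Jane Roe']
    contributors = []
    for name in names:
        for c in contributors:
            if any(levenshtein_distance(a, name) <= 1
                   for a in c['dev_aliases']):
                c['dev_aliases'].append(name)
                break
        else:
            contributors.append({'dev_aliases': [name]})
    # -> [{'dev_aliases': ['John Doe', 'Jon Doe']},
    #     {'dev_aliases': ['Jane Roe']}]
    return contributors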
def test_saturday_sunday(self):
    """Compare 'Saturday' with 'Sunday'."""
    self.assertEqual(3, util.levenshtein_distance('Sunday', 'Saturday'))
def test_kitten_sitting(self):
    """Compare 'kitten' with 'sitting'."""
    self.assertEqual(3, util.levenshtein_distance('sitting', 'kitten'))
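# The tests above pin down the behavior of util.levenshtein_distance but not
# its implementation. A minimal sketch, assuming the classic Wagner-Fischer
# dynamic programme; it works on any sequences, so the token lists used for
# the word error rate further below are handled too.
def levenshtein_distance(a, b):
    # prev[j] holds the edit distance between a[:i-1] and b[:j]; each row
    # extends a by one element.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]
# e.g. levenshtein_distance('Sunday', 'Saturday') == 3, matching the test.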
def distance(a, b):
    """Levenshtein distance normalized by the length of the longer string."""
    return levenshtein_distance(a, b) / float(max(len(a), len(b), 1))
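# A few worked values for the normalized distance above, using edit
# distances confirmed by the tests; the result always falls in [0, 1]:
#   distance('kitten', 'sitting')   -> 3 / 7 ~= 0.4286
#   distance('Sunday', 'Saturday')  -> 3 / 8 == 0.375
#   distance('', '')                -> 0 / 1 == 0.0
# The max(..., 1) term guards against division by zero when both strings
# are empty.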
def test():
    # Getting settings from config.py
    max_len = cfg.MAX_TOKEN_LEN
    num_token = cfg.NUM_OF_TOKEN
    imw = cfg.IMW
    imh = cfg.IMH

    # Training params
    is_train = False
    batch_size = 1

    # Tracking/Saving
    num_ite_to_log = cfg.NUM_ITE_TO_LOG
    num_ite_to_vis = cfg.NUM_ITE_TO_VIS
    save_name = cfg.SAVE_NAME
    test_name = cfg.TEST_NAME
    vis_path = cfg.VIS_PATH
    use_cuda = cfg.CUDA and torch.cuda.is_available()
    save_path = cfg.MODEL_FOLDER
    dataset_path = cfg.DATASET_PATH + 'CROHME2013_data/TestINKML/'
    scale_factor = cfg.TEST_SCALE_FACTOR

    # Load the vocab dictionary for display purposes
    word_to_id, id_to_word = get_gt.build_vocab('mathsymbolclass.txt')
    start_id = word_to_id['<s>']
    stop_id = word_to_id['</s>']

    # Initialize the network and load its most recent saved weights
    net = AGRU()
    save_files = glob.glob(save_path + save_name + '*.dat')
    if len(save_files) > 0:
        save_file = sorted(save_files)[-1]
        print('Loading network weights saved at %s...' % save_file)
        loadobj = torch.load(save_file)
        net.load_state_dict(loadobj['state_dict'])
        print('Loading done.')
    if use_cuda:
        net.cuda()
    # For debugging
    if not is_train:
        net.train(False)

    # Get full paths to the test inkml files and a matching list of scale
    # factors used for rendering test images
    inkml_list = np.asarray(glob.glob(dataset_path + '*.inkml'))
    scale_list = np.asarray([scale_factor] * len(inkml_list))
    num_test = len(inkml_list)
    num_ite = int(np.ceil(1.0 * num_test / batch_size))

    # Exact match and word error rate
    em = []
    wer = []
    all_pred = []
    all_gt = []

    # Main test loop
    for i in range(num_ite):
        batch_idx = range(i * batch_size, (i + 1) * batch_size)
        if batch_idx[-1] >= num_test:
            batch_idx = range(i * batch_size, num_test)
            batch_size = len(batch_idx)
        batch_x = util.batch_data(inkml_list[batch_idx],
                                  scale_list[batch_idx], is_train)
        batch_y_np = util.batch_target(inkml_list[batch_idx])
        batch_y = util.np_to_var(batch_y_np, use_cuda)

        # Decode with beam search rather than teacher forcing
        pred_y, attention = net.beam_search(batch_x, start_id, stop_id)
        pred_y = util.var_to_np(pred_y, use_cuda)
        pred_y = np.argmax(pred_y, 2)
        batch_y = np.reshape(batch_y_np, (batch_size, max_len))
        print('Finished ite %d/%d.' % (i, num_ite))

        # Convert predicted and ground-truth ids back to symbol strings
        j = 0
        pred_string = [id_to_word[idx] for idx in list(pred_y[j, :])]
        gt_string = [id_to_word[idx] for idx in list(batch_y[0, :])]
        all_pred.append(pred_string)
        all_gt.append(gt_string)
        em.append(util.exact_match(pred_string, gt_string))
        if '</s>' in pred_string:
            pred_string = pred_string[0:pred_string.index('</s>') + 1]
            gt_string = gt_string[0:gt_string.index('</s>') + 1]
        # Edit distance between token sequences, reported below as the
        # word error rate
        wer.append(util.levenshtein_distance(pred_string, gt_string))

        # Skip the console output and visualization every fourth iteration
        if i % 4 == 0:
            continue

        # Print prediction and target to console
        print('Prediction: %s' % ' '.join(pred_string))
        print('Target: %s\n' % ' '.join(gt_string))

        # Save attention maps to files for visualization
        file_name = ntpath.basename(inkml_list[batch_idx[j]])[:-6]
        vis_path_j = vis_path + file_name + '/'
        if not os.path.exists(vis_path_j):
            os.makedirs(vis_path_j)
        tmp_x = np.sum(batch_x.data.cpu().numpy()[j, :, :, :], axis=0)
        attention_np = attention.data.cpu().numpy()[j, 1:, :, :]
        pred_string = pred_string[1:]
        for k, word in enumerate(pred_string):
            # '/' is not valid in a filename
            word = word.replace('/', 'slash_')
            # Overlay the normalized attention map on the input image
            attention_k = attention_np[k, :, :] / np.max(
                attention_np[k, :, :]) * 0.8
            attention_k = scipy.misc.imresize(attention_k, 16.0) / 255.0
            tmp_x = scipy.misc.imresize(tmp_x, attention_k.shape)
            attention_k += tmp_x
            attention_k[attention_k > 1] = 1
            try:
                scipy.misc.imsave(vis_path_j + ('%02d_%s.jpg' % (k, word)),
                                  attention_k)
            except FileNotFoundError:
                pdb.set_trace()
            # Stop at the end-of-sequence token ('</s>' after the slash
            # replacement above)
            if word == '<slash_s>':
                break

    print("Exact match count: %d/%d" % (sum(em), len(em)))
    print("Word error rate: %.5f" % np.mean(wer))
    util.save_list([em, wer, all_pred, all_gt], save_path + test_name + '.dat')
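# util.exact_match is not shown in the source; a hedged sketch of what the
# loop above appears to assume (a 0/1 comparison of whole token sequences,
# so that sum(em) counts exact matches):
def exact_match(pred_tokens, gt_tokens):
    # 1 when the predicted token sequence equals the ground truth exactly,
    # 0 otherwise.
    return int(list(pred_tokens) == list(gt_tokens))
# e.g. exact_match(['<s>', 'x', '</s>'], ['<s>', 'x', '</s>']) == 1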