clean_time = t1 - t0 print clean_time / N ## Then pre-cluster by the leading 3 characters of the name t0 = time.time() leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3) t1 = time.time() leading_ngram_time = t1 - t0 print leading_ngram_time / N ## Then do the disambig on each list: out = {} t0 = time.time() n_gram = 2 for k, v in leading_ngram_dict.iteritems(): #print k if len(k) > 1 and len(v) > n_gram: mat = psDisambig.build_incremental_ngram_mat(v, n=n_gram ) if mat['tf_matrix'] is not None: out[k] = psDisambig.cosine_similarity_match(mat['tf_matrix']) else: out[k] = None t1 = time.time() dict_match_time = t1 - t0 print dict_match_time / N
### Works out to ~ 0.05s / entry clean_time = t1 - t0 print clean_time / N ## Then pre-cluster by the leading 3 characters of the name t0 = time.time() leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3) t1 = time.time() leading_ngram_time = t1 - t0 print leading_ngram_time / N ## Then do the disambig on each list: out = {} t0 = time.time() n_gram = 2 for k, v in leading_ngram_dict.iteritems(): #print k if len(k) > 1 and len(v) > n_gram: mat = psDisambig.build_incremental_ngram_mat(v, n=n_gram) if mat['tf_matrix'] is not None: out[k] = psDisambig.cosine_similarity_match(mat['tf_matrix']) else: out[k] = None t1 = time.time() dict_match_time = t1 - t0 print dict_match_time / N
## Block the potential matches for canonical name `k` by their leading
## `leading_n` characters, then score every blocked candidate against
## `k` with n-gram cosine similarity, keeping pairs above `threshold`.
## Assumes k, company_potential_matches, block_dict, leading_n, ngram,
## and threshold are defined upstream (not visible in this chunk).
for name in company_potential_matches[k]:
    leading_letter_hash = name[0:leading_n]
    # setdefault replaces the original if/else bucket-append idiom.
    block_dict.setdefault(leading_letter_hash, []).append(name)

match_list = []
for block in block_dict:
    ## Build up the ngram matrix for this canonical name and
    ## all potential matches; the canonical name occupies the last row.
    this_block = block_dict[block]
    whole_block = this_block[:]
    whole_block.append(k)
    name_row = len(whole_block) - 1
    mat = psDisambig.build_incremental_ngram_mat(whole_block, n=ngram)
    # Flattened from two nested ifs: only score non-trivial matrices.
    if mat['tf_matrix'] is not None and mat['tf_matrix'].shape[0] > 1:
        ## COO matrix allows better slicing
        mat['tf_matrix'] = mat['tf_matrix'].tocoo()
        sim = psDisambig.rowwise_cosine_similarity(mat['tf_matrix'],
                                                   mat['tf_matrix'].getrow(name_row))
        for colnum in range(sim.shape[1]):
            # Keep candidates above threshold, excluding the self-match
            # of the canonical name against its own row.
            if sim[0, colnum] > threshold and colnum != name_row:
                # FIX: the original wrapped each pair in a one-element
                # list and guarded with `len(match) > 0`, which was
                # always true — append the pair directly instead.
                match_list.append((whole_block[colnum], sim[0, colnum]))

filename = './data/candidate_matches/' + k + '_' + str(threshold) + '_candidate_match_' + '23_07_2012.csv'
clean_names = [psCleanup.rem_diacritics(n) for n in names] clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names] clean_names = [psCleanup.stdize_case(n) for n in clean_names] clean_names = [translate_non_alphanumerics(n) for n in clean_names] clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts) clean_names = [n.strip() for n in clean_names] t1 = time.time() ### Works out to ~ 0.05s / entry clean_time = t1 - t0 print clean_time / N ## Note that this was 0.0003 s / name this time around. ## Then distance the cleaned names t0 = time.time() ngram_mat = psDisambig.build_incremental_ngram_mat(clean_names, n=2) t1 = time.time() ## Works out to ~ 0.0005s / entry ngram_mat_time = t1 - t0 print ngram_mat_time / N ## Time creation of the ngram dict t0 = time.time() leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3) t1 = time.time() leading_ngram_time = t1 - t0 print leading_ngram_time / N
clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names] clean_names = [psCleanup.stdize_case(n) for n in clean_names] clean_names = [translate_non_alphanumerics(n) for n in clean_names] clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts) clean_names = [n.strip() for n in clean_names] t1 = time.time() ### Works out to ~ 0.05s / entry clean_time = t1 - t0 print clean_time / N ## Note that this was 0.0003 s / name this time around. ## Then distance the cleaned names t0 = time.time() ngram_mat = psDisambig.build_incremental_ngram_mat(clean_names, n=2) t1 = time.time() ## Works out to ~ 0.0005s / entry ngram_mat_time = t1 - t0 print ngram_mat_time / N ## Time creation of the ngram dict t0 = time.time() leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3) t1 = time.time() leading_ngram_time = t1 - t0 print leading_ngram_time / N