### Works out to ~ 0.05s / entry
clean_time = t1 - t0
print clean_time / N

## Then pre-cluster by the leading 3 characters of the name
t0 = time.time()
leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3)
t1 = time.time()

leading_ngram_time = t1 - t0
print leading_ngram_time / N
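
## For reference, build_leading_ngram_dict presumably just groups names by
## their first leading_n characters, along the lines of this sketch (a
## hypothetical equivalent, not the psDisambig source):
def leading_ngram_dict_sketch(names, leading_n=3):
    grouped = {}
    for name in names:
        key = name[0:leading_n]
        grouped.setdefault(key, []).append(name)
    return grouped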

## Then do the disambig on each list:
out = {}
t0 = time.time()
n_gram = 2
for k, v in leading_ngram_dict.iteritems():
    #print k
    if len(k) > 1 and len(v) > n_gram:
        mat = psDisambig.build_incremental_ngram_mat(v, n=n_gram)
        if mat['tf_matrix'] is not None:
            out[k] = psDisambig.cosine_similarity_match(mat['tf_matrix'])
        else:
            out[k] = None

t1 = time.time()

dict_match_time = t1 - t0
print dict_match_time / N
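
## For intuition, cosine_similarity_match presumably computes pairwise
## cosine similarities between the rows of the sparse tf matrix; a minimal
## sketch (not the psDisambig implementation) via row normalization:
from sklearn.preprocessing import normalize

def cosine_similarity_sketch(tf_matrix):
    ## L2-normalize each row, then X * X.T is the matrix of pairwise
    ## cosine similarities between names
    normed = normalize(tf_matrix.tocsr(), norm='l2', axis=1)
    return normed * normed.T
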
## Alternate approach: for a single canonical name k, block its potential
## matches by leading characters, then compare within each block.
## (k, company_potential_matches, leading_n, ngram, and threshold are
## assumed to be defined in the enclosing scope.)
block_dict = {}
for name in company_potential_matches[k]:
    leading_letter_hash = name[0:leading_n]
    if leading_letter_hash in block_dict:
        block_dict[leading_letter_hash].append(name)
    else:
        block_dict[leading_letter_hash] = [name]

match_list = []
for block in block_dict:
    ## Build up the ngram matrix for this canonical name and
    ## all potential matches
    this_block = block_dict[block]
    whole_block = this_block[:]
    whole_block.append(k)
    name_row = len(whole_block) - 1
    mat = psDisambig.build_incremental_ngram_mat(whole_block, n=ngram)
    if mat['tf_matrix'] is not None and mat['tf_matrix'].shape[0] > 1:
        ## COO matrix allows better slicing
        mat['tf_matrix'] = mat['tf_matrix'].tocoo()
        sim = psDisambig.rowwise_cosine_similarity(mat['tf_matrix'],
                                                   mat['tf_matrix'].getrow(name_row))
        ## Keep every non-self column whose similarity clears the threshold
        for colnum in range(sim.shape[1]):
            if sim[0, colnum] > threshold and colnum != name_row:
                match_list.append((whole_block[colnum], sim[0, colnum]))

filename = ('./data/candidate_matches/' + k + '_' + str(threshold) +
            '_candidate_match_' + '23_07_2012.csv')
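
## The write step isn't shown above; one plausible way to dump match_list
## to the candidate-match file (the three-column format is an assumption):
import csv

with open(filename, 'wb') as f:
    writer = csv.writer(f)
    for matched_name, score in match_list:
        writer.writerow([k, matched_name, score])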

## Another timing run: clean the raw names first
t0 = time.time()
clean_names = [psCleanup.rem_diacritics(n) for n in names]
clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names]
clean_names = [psCleanup.stdize_case(n) for n in clean_names]
clean_names = [translate_non_alphanumerics(n) for n in clean_names]
clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts)
clean_names = [n.strip() for n in clean_names]
t1 = time.time()

### Previously this worked out to ~ 0.05s / entry
clean_time = t1 - t0
print clean_time / N
## Note that this was 0.0003 s / name this time around.
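
## For reference, hypothetical equivalents of a couple of the psCleanup
## helpers used above (sketches, not the psCleanup source):
import unicodedata

def rem_diacritics_sketch(name):
    ## Decompose accented characters, then drop the combining marks
    ## (assumes name is already a unicode string)
    decomposed = unicodedata.normalize('NFKD', name)
    return u''.join(c for c in decomposed if not unicodedata.combining(c))

def stdize_case_sketch(name):
    ## Case-fold so 'ACME Corp' and 'acme corp' compare equal
    return name.lower()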

## Then distance the cleaned names
t0 = time.time()
ngram_mat = psDisambig.build_incremental_ngram_mat(clean_names, n=2)
t1 = time.time()

## Works out to ~ 0.0005s / entry
ngram_mat_time = t1 - t0
print ngram_mat_time / N
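
## For intuition, a character-bigram count matrix like the one
## build_incremental_ngram_mat presumably returns can be built with
## sklearn's CountVectorizer (an illustration, not the psDisambig code):
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
tf_sketch = vectorizer.fit_transform(clean_names)  # sparse names x bigrams counts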

## Time creation of the ngram dict
t0 = time.time()
leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names,
                                                         leading_n=3)
t1 = time.time()

leading_ngram_time = t1 - t0
print leading_ngram_time / N