import re
import string
import time

import numpy

import psCleanup
import psDisambig

# Assumed defined elsewhere in the project: all_dicts, get_combinatorial_pairs,
# myquery, tuple_clean, years.


def translate_non_alphanumerics(to_translate, translate_to=u''):
    # Map each punctuation character to translate_to (dropped by default).
    # The def line was truncated in the original; the signature is
    # reconstructed from the variable names used in the body.
    not_letters_or_digits = unicode(string.punctuation)
    translate_table = dict((ord(char), translate_to)
                           for char in not_letters_or_digits)
    return to_translate.translate(translate_table)

def strip_punc(s):
    # Python 2 bytestring version: delete punctuation via str.translate's
    # deletechars argument.
    s_out = s.translate(string.maketrans('', ''), string.punctuation)
    return s_out
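
## Quick check of the two helpers (Python 2: the first expects a unicode
## string, the second a byte string). Expected output is an assumption
## based on the u'' default above.
print translate_non_alphanumerics(u'Acme, Inc.')   # u'Acme Inc'
print strip_punc('Acme, Inc.')                     # 'Acme Inc'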
# Function names below are not exact.
N = len(names)
t0 = time.time()
clean_names = [psCleanup.rem_diacritics(n) for n in names]
clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names]
clean_names = [psCleanup.stdize_case(n) for n in clean_names]
clean_names = [translate_non_alphanumerics(n) for n in clean_names]
clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts)
clean_names = [n.strip() for n in clean_names]
t1 = time.time()

### Works out to ~ 0.05s / entry
clean_time = t1 - t0
print clean_time / N

## Then pre-cluster by the leading 3 characters of the name
t0 = time.time()
leading_ngram_dict = psDisambig.build_leading_ngram_dict(clean_names, leading_n=3)
t1 = time.time()

leading_ngram_time = t1 - t0
print leading_ngram_time / N
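
## psDisambig.build_leading_ngram_dict is project code. As a rough sketch of
## what such a pre-clustering step does (a hypothetical stand-in, not the
## actual implementation), bucket names by their first leading_n characters:
from collections import defaultdict

def build_leading_ngram_dict_sketch(names, leading_n=3):
    buckets = defaultdict(list)
    for name in names:
        buckets[name[:leading_n]].append(name)
    return dict(buckets)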
## Variant of the cleaning pipeline above: identical steps, except the final
## pass removes all internal whitespace instead of only trimming the ends.
N = len(names)
t0 = time.time()
clean_names = [psCleanup.rem_diacritics(n) for n in names]
clean_names = [psCleanup.rem_trail_spaces(n) for n in clean_names]
clean_names = [psCleanup.stdize_case(n) for n in clean_names]
clean_names = [translate_non_alphanumerics(n) for n in clean_names]
clean_names = psCleanup.master_clean_dicts(clean_names, all_dicts)
clean_names = [re.sub(' ', '', n) for n in clean_names]
t1 = time.time()

## Define some blocking functions


def block_by_2_ngrams(name_list, ngram_length=2):
    # Block names that share a pair of character ngrams, so that expensive
    # pairwise comparisons only run within blocks.
    block_dict = {}
    for name in name_list:
        # +1 so the final ngram of the string is included
        these_ngrams = set(name[i:i + ngram_length]
                           for i in range(len(name) - ngram_length + 1))
        ngram_combos = get_combinatorial_pairs(list(these_ngrams))
        for ngram_pair in ngram_combos:
            # Assumed completion: the original is truncated at this point.
            block_dict.setdefault(ngram_pair, []).append(name)
    return block_dict
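
## get_combinatorial_pairs is referenced but not defined in this excerpt; a
## hypothetical stand-in via itertools (the pairs must be hashable, since
## they serve as dict keys above):
from itertools import combinations

def get_combinatorial_pairs(items):
    return list(combinations(sorted(items), 2))

## Example: blocked = block_by_2_ngrams(clean_names)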
## Example #3
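
## myquery is a project helper for running SQL against PATSTAT; a
## hypothetical minimal version using MySQLdb (connection parameters are
## placeholders) might look like:
import MySQLdb

def myquery_sketch(sql):
    conn = MySQLdb.connect(host='localhost', user='user', passwd='secret',
                           db='patstat')
    cur = conn.cursor()
    cur.execute(sql)
    rows = cur.fetchall()
    cur.close()
    conn.close()
    return rows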
## Reconstructed head of the per-year extraction loop (truncated in the
## original). The SELECT list is an assumption, ordered so that q[2] below
## is the person name.
total_elapsed_time = 0
for year in years:
    dataextract = """
    SELECT tls201_appln.appln_id, tls206_person.person_id, tls206_person.person_name
    FROM tls206_person INNER JOIN tls207_pers_appln ON tls206_person.person_id = tls207_pers_appln.person_id
        INNER JOIN tls201_appln ON tls201_appln.appln_id = tls207_pers_appln.appln_id
        INNER JOIN tls209_appln_ipc ON tls209_appln_ipc.appln_id = tls207_pers_appln.appln_id
    WHERE YEAR(tls201_appln.appln_filing_date) = """ + year + """
    GROUP BY tls207_pers_appln.appln_id ORDER BY NULL LIMIT 10
    """


    date = time.strftime('%c', time.localtime())
    print 'Processing ' + year + '.  Started: ' + date

    time_start = time.time()
    
    query_output = myquery(dataextract)
    final_output = tuple_clean(query_output)

    time_end = time.time()
    elapsed_time = time_end - time_start
    total_elapsed_time += elapsed_time
    
    print 'Time elapsed for ' + year + ': ' + str(numpy.round(elapsed_time, 0))
    print 'Overall elapsed time: ' + str(numpy.round(total_elapsed_time, 0))


names = [q[2] for q in query_output]
name_time_start = time.time()
test = psCleanup.master_clean_dicts(names, psCleanup.cleanup_dicts)
name_time_end = time.time()
print name_time_end - name_time_start
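
## master_clean_dicts and the cleanup dicts it consumes are project code; a
## hypothetical minimal version, assuming each dict maps regex patterns to
## replacements and the dicts are applied in order:
def master_clean_dicts_sketch(name_list, dict_list):
    out = list(name_list)
    for d in dict_list:
        for pattern, repl in d.items():
            out = [re.sub(pattern, repl, n) for n in out]
    return out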

def clean_list_wrapper(name_list, dict_list=all_dicts):
    # Convenience wrapper tying the individual cleanup steps together.
    out = [psCleanup.rem_diacritics(n) for n in name_list]
    out = [psCleanup.stdize_case(n) for n in out]
    out = psCleanup.master_clean_dicts(out, dict_list)
    out = [n.strip() for n in out]
    return out
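
## Example use of the wrapper on the raw names pulled from the query above:
clean_names = clean_list_wrapper(names)
print clean_names[:5]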