コード例 #1
0
def print_err_type(sent_orig, sent_predict, sent_default):
    """Classify and print the type of prediction error for one sentence.

    Compares the model's prediction against the original sentence and the
    default (unmodified) sentence and labels the error as one of:

      * "no_position"    -- the model made no change (prediction == default)
      * "right_position" -- at most one word differs from the original
      * "wrong_position" -- more than one word differs

    Prints a one-line summary and returns None.  If the predicted and
    original word counts differ, prints a diagnostic and returns early.
    """
    err = edit_distance(sent_orig, sent_predict)
    words_orig = sent_orig.split(' ')
    words_predict = sent_predict.split(' ')

    if sent_predict == sent_default:
        err_type = "no_position"
    else:
        if len(words_predict) != len(words_orig):
            print("EXCEPTION: sentence length mismatch:")
            print("words_predict:", len(words_predict), words_predict)
            print("words_orig   :", len(words_orig), words_orig)
            return

        # Count positions where prediction and original disagree; a single
        # differing word means the insertion position was right.
        n_words_diff = sum(w1 != w2
                           for w1, w2 in zip(words_predict, words_orig))
        if n_words_diff <= 1:
            err_type = "right_position"
        else:
            err_type = "wrong_position"

    added_chars = len(sent_predict) - len(sent_default)
    print("ERROR err:", err, "err_type:", err_type, "added_chars:", added_chars)
コード例 #2
0
def gen_text_similarity_feature(sa,
                                sb,
                                prefix='',
                                ngrams_word_jaccard=(),
                                use_char_ngram_jaccard=False,
                                ngrams_char_jaccard=(3, 4, 5)):
    """Build a dict of text-similarity features between strings *sa* and *sb*.

    Features (all keys prefixed with *prefix*):
      * word0/word1 Jaccard over two tokenizations (tokenize0 / tokenize1)
      * word n-gram Jaccard for each n in *ngrams_word_jaccard*
      * char n-gram Jaccard for each n in *ngrams_char_jaccard*
        (only when *use_char_ngram_jaccard* is true)
      * Jaro-Winkler similarity
      * edit distance normalized by the combined length

    Returns an empty dict when either argument is not a str.
    """
    if not isinstance(sa, str) or not isinstance(sb, str):
        return {}
    feats = {}

    wa0 = tokenize0(sa)
    wb0 = tokenize0(sb)
    wa1 = tokenize1(sa)
    wb1 = tokenize1(sb)

    feats[prefix + 'word0_jaccard'] = jaccard(wa0, wb0)
    feats[prefix + 'word1_jaccard'] = jaccard(wa1, wb1)

    # Defaults are tuples, not lists: mutable default arguments are shared
    # across calls.  Callers may still pass lists; we only iterate them.
    for n in ngrams_word_jaccard:
        feats[prefix + 'word0_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa0, wb0, n)
        feats[prefix + 'word1_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa1, wb1, n)

    if use_char_ngram_jaccard:
        for n in ngrams_char_jaccard:
            feats[prefix +
                  'char_jaccard_{}gram'.format(n)] = char_jaccard_ngram(
                      sa, sb, n)

    feats[prefix + 'jw'] = jaro_winkler(sa, sb)
    # Guard the normalization: both strings empty would divide by zero.
    total_len = len(sa) + len(sb)
    feats[prefix + 'edit_distance_ratio'] = (
        edit_distance(sa, sb) / total_len if total_len else 0.0)

    return feats
コード例 #3
0
def calc_improvement(model, reg, max_lines=None):
    """Measure the model's edit-distance improvement on the holdout file.

    For each holdout sentence, deletes one random interior word (when the
    sentence has more than two words), asks the model for its best
    reconstruction, and accumulates edit distances of both the damaged
    ("default") sentence and the predicted sentence against the original.

    Parameters:
        model     -- object exposing best_sentence(words, reg)
        reg       -- regularization value forwarded to the model
        max_lines -- stop after this many lines when truthy

    Returns the relative improvement
    (err_default - err_predict) / err_default, or 0.0 when the cumulative
    default error is zero (nothing to improve).
    """
    random.seed(RAND_SEED)
    err_default = 0
    err_predict = 0

    # 'with' guarantees the handle closes even if model.best_sentence raises;
    # the old version leaked the handle on any exception.
    with open(HOLDOUT_FILE, 'r') as holdout:
        for line_num, line in enumerate(holdout):
            if max_lines and line_num > max_lines:
                break
            words = line.rstrip().split()
            if len(words) > 2:
                # Delete one random interior word (never the first or last).
                del_word_posn = random.randint(1, len(words) - 2)
                words_orig = words
                words = words[:del_word_posn] + words[del_word_posn + 1:]
            else:
                words_orig = words

            sent_orig = ' '.join(words_orig)
            sent_default = ' '.join(words)
            sent_predict = model.best_sentence(words, reg)

            err_default += edit_distance(sent_orig, sent_default)
            err_predict += edit_distance(sent_orig, sent_predict)

    # The old code also computed this inside the loop, dividing by
    # err_default while it could still be zero -- a latent ZeroDivisionError.
    if err_default == 0:
        return 0.0
    return float(err_default - err_predict) / err_default
コード例 #4
0
def make_results_list(res, artist_name):
    """Return the rows of *res* (converted to dicts) whose artist name is
    a close enough match to *artist_name*.

    A candidate survives when its edit distance to *artist_name* is at
    most 1 + half the length of the longer of the two names.
    """
    matches = []
    for row in res:
        candidate = dict(row)
        cutoff = 1 + max(len(artist_name), len(candidate['artist_name'])) / 2
        if edit_distance(artist_name, candidate['artist_name']) <= cutoff:
            matches.append(candidate)
    return matches
コード例 #5
0
def make_results_list(res, artist_name):
    """Return rows of *res* (as dicts) close to *artist_name*, nearest first.

    A candidate is kept when its edit distance to *artist_name* is at most
    1 + half the length of the longer of the two names; survivors are
    returned sorted by ascending edit distance.
    """
    results = []

    for item in map(dict, res):
        ed = edit_distance(artist_name, item['artist_name'])

        if ed > 1 + max(len(artist_name), len(item['artist_name'])) / 2:
            continue
        results.append((ed, item))

    # Sort on the distance only: the old plain tuple sort fell back to
    # comparing the dicts on ties, which raises TypeError on Python 3.
    results.sort(key=lambda pair: pair[0])

    return [item for _, item in results]
コード例 #6
0
def pairwise_comparison(raw_names, big_cities):
    """Score every raw place name against every big city.

    For each (raw_name, city) pair, records the edit distance between the
    lowercased names under matches[raw_name]["city, state"].  Pairs whose
    first letters differ are ruled out cheaply with np.inf instead.
    """
    matches = defaultdict(dict)
    for raw_name in raw_names:
        # Hoist the full lowercasing out of the inner loop.
        raw_lower = raw_name.lower()
        for _, row in big_cities.iterrows():
            big_city = row['mc']
            key = ', '.join(row)  # "city, state"
            if raw_name[0].lower() == big_city[0].lower():
                matches[raw_name][key] = edit_distance(raw_lower,
                                                       big_city.lower())
            else:
                # First letters MUST agree; otherwise skip the costly distance.
                matches[raw_name][key] = np.inf
    return matches
コード例 #7
0
def write_improved_submission(infile_name, outfile_name, substitutions):
    """Copy a submission CSV, applying sentence substitutions where given.

    Each data row is (sentence_id, sentence).  When sentence_id appears in
    *substitutions*, the replacement sentence is written instead and the
    edit distance between old and new sentences is credited as errors fixed.
    The header row is copied through unchanged.

    Prints the estimated improvement: total errors fixed per data line
    (0.0 when the file contains only a header).
    """
    # Python 3 csv wants text mode with newline=''; the old code opened the
    # files in 'rb'/'wb' (Python 2 only) and never closed either handle.
    with open(infile_name, 'r', newline='') as fin, \
         open(outfile_name, 'w', newline='') as fout:
        infile = csv.reader(fin)
        outfile = csv.writer(fout, quoting=csv.QUOTE_NONNUMERIC)
        header = next(infile)  # file.next() was removed in Python 3
        outfile.writerow(header)

        nlines = 0
        errs_fixed = 0
        for sentence_id, sentence in infile:
            sentence_id = int(sentence_id)
            if sentence_id in substitutions:
                new_sentence = substitutions[sentence_id]
                errs_fixed += edit_distance(new_sentence, sentence)
            else:
                new_sentence = sentence
            nlines += 1
            outfile.writerow((sentence_id, new_sentence))

    # Guard against an input file with no data rows.
    err_improvement = float(errs_fixed) / nlines if nlines else 0.0
    print("Estimated improvement:", err_improvement)