from itertools import izip


def print_err_type(sent_orig, sent_predict, sent_default):
    """Classify and report the type of prediction error for one sentence."""
    err = edit_distance(sent_orig, sent_predict)
    words_orig = sent_orig.split(' ')
    words_predict = sent_predict.split(' ')
    words_default = sent_default.split(' ')
    if sent_predict == sent_default:
        # The model left the default (damaged) sentence untouched.
        err_type = "no_position"
    else:
        if len(words_predict) != len(words_orig):
            print "EXCEPTION: sentence length mismatch:"
            print "words_predict:", len(words_predict), words_predict
            print "words_orig   :", len(words_orig), words_orig
            return
        # Count positions where the prediction disagrees with the original.
        n_words_diff = sum(w1 != w2
                           for w1, w2 in izip(words_predict, words_orig))
        posn_right = (n_words_diff <= 1)
        if posn_right:
            err_type = "right_position"
        else:
            err_type = "wrong_position"
    added_chars = len(sent_predict) - len(sent_default)
    print "ERROR err:", err, "err_type:", err_type, "added_chars:", added_chars
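# A hedged usage sketch of print_err_type: the sentences are made up, and
# edit_distance is assumed to be a Levenshtein implementation such as
# nltk.edit_distance. The prediction matches the original exactly, so at most
# one word position differs and the error is classed as "right_position".
print_err_type("the quick brown fox",  # sent_orig
               "the quick brown fox",  # sent_predict
               "the quick fox")        # sent_default (one word deleted)
# -> ERROR err: 0 err_type: right_position added_chars: 6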
def gen_text_similarity_feature(sa, sb, prefix='', ngrams_word_jaccard=[],
                                use_char_ngram_jaccard=False,
                                ngrams_char_jaccard=[3, 4, 5]):
    """Build a dict of string-similarity features for the pair (sa, sb).

    tokenize0/tokenize1, jaccard, word_jaccard_ngram, char_jaccard_ngram,
    jaro_winkler and edit_distance are module-level helpers.
    """
    if not isinstance(sa, str) or not isinstance(sb, str):
        return {}
    feats = {}
    wa0 = tokenize0(sa)
    wb0 = tokenize0(sb)
    wa1 = tokenize1(sa)
    wb1 = tokenize1(sb)
    # Whole-token Jaccard similarity under both tokenizers.
    feats[prefix + 'word0_jaccard'] = jaccard(wa0, wb0)
    feats[prefix + 'word1_jaccard'] = jaccard(wa1, wb1)
    for n in ngrams_word_jaccard:
        feats[prefix + 'word0_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa0, wb0, n)
        feats[prefix + 'word1_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa1, wb1, n)
    if use_char_ngram_jaccard:
        for n in ngrams_char_jaccard:
            feats[prefix + 'char_jaccard_{}gram'.format(n)] = char_jaccard_ngram(
                sa, sb, n)
    feats[prefix + 'jw'] = jaro_winkler(sa, sb)
    # Use float division: the original integer division silently truncated
    # this feature to 0 under Python 2.
    feats[prefix + 'edit_distance_ratio'] = (
        edit_distance(sa, sb) / float(len(sa) + len(sb)))
    return feats
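# Hedged usage sketch for gen_text_similarity_feature: the helper functions
# (tokenize0, tokenize1, jaccard, jaro_winkler, ...) are assumed to be defined
# in the same module, so the exact values depend on their definitions.
feats = gen_text_similarity_feature("barack obama", "barak obama",
                                    prefix="q_", ngrams_word_jaccard=[2])
# feats now maps names like 'q_word0_jaccard', 'q_word0_jaccard_2gram',
# 'q_jw' and 'q_edit_distance_ratio' to similarity scores for the pair.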
import random


def calc_improvement(model, reg, max_lines=None):
    """Score `model` on the holdout set: delete one interior word from each
    sentence, then measure how much model.best_sentence() reduces the edit
    distance back to the original, relative to leaving the sentence alone.
    RAND_SEED and HOLDOUT_FILE are module-level constants."""
    random.seed(RAND_SEED)
    err_default = 0
    err_predict = 0
    holdout = open(HOLDOUT_FILE, 'r')
    for line_num, line in enumerate(holdout):
        words = line.rstrip().split()
        if max_lines and line_num > max_lines:
            break
        if len(words) > 2:
            # Drop one randomly chosen interior word.
            del_word_posn = random.randint(1, len(words) - 2)
            del_word = words[del_word_posn]
            words_orig = words
            words = words[0:del_word_posn] + words[(del_word_posn + 1):]
        else:
            words_orig = words
        sent_orig = ' '.join(words_orig)
        sent_default = ' '.join(words)
        sent_predict = model.best_sentence(words, reg)
        err_orig_default = edit_distance(sent_orig, sent_default)
        err_default += err_orig_default
        err_orig_predict = edit_distance(sent_orig, sent_predict)
        err_predict += err_orig_predict
        if False:  # DEBUG
            print
            print
            print "sentence#:", line_num
            # print_err_type(sent_orig, sent_predict, sent_default)
            print "deleted_word:", del_word,
            print "pred_err:", edit_distance(sent_orig, sent_predict)
            for tag, s in [('orig:', sent_orig), ('dflt:', sent_default),
                           ('pred:', sent_predict)]:
                print tag, edit_distance(sent_orig, s), s
            improvement = float(err_default - err_predict) / err_default
            # print "cumulative improvement:", improvement
    holdout.close()
    improvement = float(err_default - err_predict) / err_default
    return improvement
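# Hedged sketch of driving calc_improvement: any object with a
# best_sentence(words, reg) method will do, and RAND_SEED / HOLDOUT_FILE are
# assumed to be module-level constants pointing at a real holdout file. This
# identity baseline returns the damaged sentence unchanged, so it should
# score an improvement of 0.0.
class IdentityModel(object):
    def best_sentence(self, words, reg):
        return ' '.join(words)

print calc_improvement(IdentityModel(), reg=1.0, max_lines=100)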
def make_results_list(res, artist_name):
    """Keep only rows whose artist_name plausibly matches the query name."""
    results = []
    for item in map(dict, res):
        # Discard candidates that differ from the query in more than about
        # half their characters.
        if edit_distance(artist_name, item['artist_name']) > 1 + max(
                len(artist_name), len(item['artist_name'])) / 2:
            continue
        results.append(item)
    return results
def make_results_list(res, artist_name):
    """Like the variant above, but return the surviving rows sorted by edit
    distance to the query, closest matches first."""
    results = []
    for item in map(dict, res):
        ed = edit_distance(artist_name, item['artist_name'])
        if ed > 1 + max(len(artist_name), len(item['artist_name'])) / 2:
            continue
        results.append((ed, item))
    # Sort on the distance only: comparing the dicts themselves on ties is
    # arbitrary in Python 2 and a TypeError in Python 3.
    results.sort(key=lambda pair: pair[0])
    return [item for _, item in results]
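# Hedged usage sketch covering both make_results_list variants: res would
# typically be an iterable of mapping-like DB rows (hence map(dict, res)).
# 'Radiohead Tribute Band' is 13 edits from the 9-character query, which is
# over the 1 + max(9, 22) / 2 = 12 threshold, so it is filtered out.
rows = [{'artist_name': 'Radiohead Tribute Band'},
        {'artist_name': 'Radiohead'}]
print make_results_list(rows, 'Radiohead')
# -> [{'artist_name': 'Radiohead'}]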
from collections import defaultdict
import numpy as np


def pairwise_comparison(raw_names, big_cities):
    """Score each raw place name against every big city by edit distance."""
    matches = defaultdict(dict)
    # For each raw place name, see how well it matches the name of a big city.
    for raw_name in raw_names:
        # Loop over big cities.
        for _, row in big_cities.iterrows():
            big_city = row['mc']
            key = ', '.join(row)  # city, state
            # The first letters MUST match.
            if raw_name[0].lower() != big_city[0].lower():
                matches[raw_name][key] = np.inf
            else:
                matches[raw_name][key] = edit_distance(raw_name.lower(),
                                                       big_city.lower())
    return matches
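# Hedged sketch for pairwise_comparison: big_cities is assumed to be a pandas
# DataFrame whose 'mc' column holds the city name, with the remaining columns
# (here a state code) folded into the result key by ', '.join(row).
import pandas as pd

big_cities = pd.DataFrame([('Boston', 'MA'), ('Chicago', 'IL')],
                          columns=['mc', 'state'])
matches = pairwise_comparison(['Bostn'], big_cities)
# matches['Bostn'] == {'Boston, MA': 1, 'Chicago, IL': inf}
# ('Chicago' fails the first-letter check, so it gets np.inf.)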
import csv


def write_improved_submission(infile_name, outfile_name, substitutions):
    """Copy a submission CSV, swapping in corrected sentences where we have
    them, and report the average number of characters fixed per line."""
    infile = csv.reader(open(infile_name, 'rb'))
    outfile = csv.writer(open(outfile_name, 'wb'),
                         quoting=csv.QUOTE_NONNUMERIC)
    header = infile.next()
    outfile.writerow(header)
    nlines = 0
    errs_fixed = 0
    for line in infile:
        sentence_id, sentence = line
        sentence_id = int(sentence_id)
        if sentence_id in substitutions:
            new_sentence = substitutions[sentence_id]
            # Credit ourselves with the edit distance between old and new.
            errs_fixed += edit_distance(new_sentence, sentence)
        else:
            new_sentence = sentence
        nlines += 1
        outfile.writerow((sentence_id, new_sentence))
    err_improvement = float(errs_fixed) / nlines
    print "Estimated improvement:", err_improvement
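# Hypothetical call: the file names and the substitutions mapping are purely
# illustrative. substitutions maps integer sentence ids from the input CSV to
# corrected sentence strings; every other row is copied through unchanged.
write_improved_submission('submission.csv', 'submission_improved.csv',
                          {17: 'i am writing a letter'})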