import argparse
import os

import tqdm


def main():
    parser = argparse.ArgumentParser(
        description='Rank corpus based on laser cosine distance')
    parser.add_argument('--debug', help='debug mode', action='store_true')
    parser.add_argument('--src_sents', help='source sentences')
    parser.add_argument('--tgt_sents', help='target sentences')
    parser.add_argument('--src_embs', help='laser embeddings for source sentences')
    parser.add_argument('--tgt_embs', help='laser embeddings for target sentences')
    parser.add_argument('--output_path', help='output directory for the ranked corpus')
    parser.add_argument('--output_corpus', help='filename of the ranked corpus')
    o = parser.parse_args()

    try:
        os.makedirs(o.output_path)
    except FileExistsError:
        # directory already exists
        pass
    output_corpus = os.path.join(o.output_path, o.output_corpus)

    src_emb = load_laser_embs(o.src_embs)
    tgt_emb = load_laser_embs(o.tgt_embs)

    sim = []
    for v1, v2 in zip(src_emb, tgt_emb):
        sim.append(similarity(v1, v2))

    # Indices of sentence pairs, sorted from most to least similar
    sim_sorted = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)

    with open(output_corpus, 'w') as output, \
            open(o.src_sents, 'r') as src_file, \
            open(o.tgt_sents, 'r') as tgt_file:
        src = src_file.readlines()
        tgt = tgt_file.readlines()
        pbar = tqdm.tqdm(total=len(src))
        for similarity_index in sim_sorted:
            pbar.update(1)
            src_sentence = src[similarity_index].strip()
            tgt_sentence = tgt[similarity_index].strip()
            # Exclude almost identical or too short sentence pairs;
            # exclude sentences containing a lot of numbers.
            if levenshtein_distance(src_sentence, tgt_sentence) < 30 \
                    or perc_numeric(src_sentence) > 0.3:
                continue
            output.write('{0}\t{1}'.format(src_sentence, tgt[similarity_index]))
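The script above leans on three helpers it does not define. A minimal sketch of what they might look like, assuming LASER embeddings stored as raw float32 vectors (1024 dimensions by default); the names mirror the calls above, but these are not the original implementations:

import numpy as np

def load_laser_embs(path, dim=1024):
    """Load LASER sentence embeddings stored as raw float32 vectors (assumed layout)."""
    vecs = np.fromfile(path, dtype=np.float32)
    return vecs.reshape(-1, dim)

def similarity(v1, v2):
    """Cosine similarity between two embedding vectors."""
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

def perc_numeric(sentence):
    """Fraction of characters in the sentence that are digits."""
    if not sentence:
        return 0.0
    return sum(ch.isdigit() for ch in sentence) / len(sentence)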
def normalize_word(self, word):
    """OOV module: first look for a known token within a small Levenshtein
    distance; otherwise return the closest word by cosine similarity."""
    if word in self.tokens.keys():
        return word
    lv_distances = defaultdict(list)
    for token in self.tokens.keys():
        distance = levenshtein_distance(word, token)
        for i in range(1, 3):
            if distance == i:
                lv_distances[i].append(token)
                break
    for i in range(1, 3):
        if lv_distances[i]:
            return lv_distances[i][0]
    return self.oov.closest_to_tokens(word, self.tokens.keys())
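Every example in this section calls a levenshtein_distance helper that is defined elsewhere (often backed by a C library). For reference, a minimal pure-Python sketch of the standard edit distance it is assumed to compute:

def levenshtein_distance(a, b):
    """Minimum number of single-character insertions, deletions and
    substitutions needed to turn string a into string b."""
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            current.append(min(previous[j] + 1,                   # deletion
                               current[j - 1] + 1,                # insertion
                               previous[j - 1] + (ca != cb)))     # substitution
        previous = current
    return previous[-1]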
def _too_similar(candidate, already_considered, threshold=0.8):
    """
    Computes a similarity ratio, 1 - (Levenshtein distance / length of the
    longer keyword), between the new candidate and every previously
    considered candidate. Returns True if the ratio exceeds the threshold
    for any previous candidate, i.e. the pair of words is too similar.
    The candidates have previously been ordered by ascending weight.

    :param candidate: string (word) -> current keyword candidate being considered
    :param already_considered: list[(keyword, weight)] -> keywords already considered
    :param threshold: float (0-1) -> maximum similarity ratio allowed
    :return: boolean
    """
    return any(
        (1.0 - levenshtein_distance(candidate, keyword)[0]
         / max(len(candidate), len(keyword)) > threshold)
        for keyword, _ in already_considered)
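A hypothetical usage sketch, assuming already_considered holds (keyword, weight) pairs and that this project's levenshtein_distance returns a tuple whose first element is the distance, as the [0] indexing above suggests:

already_considered = [("neural network", 0.42), ("gradient descent", 0.31)]
print(_too_similar("neural networks", already_considered))   # True: ratio ~0.93
print(_too_similar("backpropagation", already_considered))   # False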
def retrieve_nearest_marker(lines1, attribute):
    if attribute == 0:
        lines2 = load_sentences(hp.data_path + '/delete/delete.test.0', False)
    else:
        lines2 = load_sentences(hp.data_path + '/delete/delete.test.1', False)
    sentences_contents_dict1 = dict(x.split('\t')[:2] for x in lines1)
    sentences_contents_dict2 = dict(x.split('\t')[:2] for x in lines2)
    sentences_marker_dict = dict(
        [x.split('\t')[0], x.split('\t')[2]] for x in lines2)
    sentences1 = sentences_contents_dict1
    sentences2 = sentences_contents_dict2
    marker = sentences_marker_dict

    attribute_markers = []
    for sentence1 in sentences1:
        dist_dict = {}
        sentence1_content = sentences1[sentence1]
        for sentence2 in sentences2.keys():
            # distance between pos_content and neg_content
            dist_dict[sentence2] = levenshtein_distance(
                sentence1_content, sentences2[sentence2])
        min_sentence = min(dist_dict, key=dist_dict.get)  # nearest marker
        attribute_markers.append(marker[min_sentence])
    return attribute_markers
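The load_sentences helper and the delete.* file layout are not shown here. Judging from the split('\t') calls, each line appears to be "sentence<TAB>content<TAB>marker"; the sketch below is an assumption (including the meaning of the second flag), not the project's actual loader:

import codecs
import random

def load_sentences(path, shuffle=False):
    # Read non-empty, tab-separated lines; the boolean flag is assumed to
    # control shuffling, which is a guess about the original signature.
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = [line.rstrip('\n') for line in f if line.strip()]
    if shuffle:
        random.shuffle(lines)
    return lines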
base_df['num_words_a'] = base_df['name_a'].apply(
    lambda x: u.get_number_words(x))
base_df['num_words_b'] = base_df['name_b'].apply(
    lambda x: u.get_number_words(x))

# Get the length of the strings
base_df['len_a'] = base_df['name_a'].apply(lambda x: len(str(x)))
base_df['len_b'] = base_df['name_b'].apply(lambda x: len(str(x)))

# Get Jaro-Winkler distance
base_df['JW_distance'] = base_df.apply(
    lambda row: u.jaro_winkler_distance(row['name_a'], row['name_b']), axis=1)

# Get Levenshtein distance
base_df['LV_distance'] = base_df.apply(
    lambda row: u.levenshtein_distance(row['name_a'], row['name_b']), axis=1)

# Get the target
base_df['target'] = df['accept_match'].apply(lambda x: u.convert_target(x))

###################################
# Split the dataset into train, dev, test set
###################################
base_df = base_df.sample(frac=1).reset_index(drop=True)

non_numerical_cols = ['name_a', 'name_b', 'acr_a', 'acr_b']
feature_columns = [
    'acr_match', 'JW_distance', 'LV_distance', 'num_words_a', 'num_words_b',
    'len_a', 'len_b'
]
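The u.get_number_words and u.convert_target helpers come from the project's utility module. Plausible stand-ins, consistent with how the resulting columns are used, might look like this (treat them as assumptions):

def get_number_words(name):
    """Number of whitespace-separated tokens in a name."""
    return len(str(name).split())

def convert_target(accept_match):
    """Map the accept_match label to a binary target (1 = match, 0 = no match)."""
    return 1 if str(accept_match).strip().lower() in ('1', 'true', 'yes', 'y') else 0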
def _predict_sequence(self, body, strict=False):
    """Predicts the part(s) of a filename root that is/are number sequence(s)
    by comparing each of the components to the corresponding components of the
    filename roots inside the base directory.

    Args:
        body (list): Head, midsection, and tail of a filename root
        strict (bool): True, to check against all filename roots in the base
            directory, or by default False, to only compare to a minimum amount

    Raises:
        RuntimeError: Unable to predict number sequence for 'self.filename'

    Returns:
        A list of indices of the predicted body components.
    """
    # Get the max. number of files from the base directory to check
    max_count = int(
        "1" + "".join(["0" for _ in range(len(str(self.fcount)) - 1)]))
    max_count = max_count if max_count > 10 else self.fcount

    # Predict the sequence(s)
    si = 0          # string start index of body component
    predicted = []  # indices of predicted sequence body components
    for i in xrange(len(body)):
        cln = len(body[i])                 # string length of body component
        bln = sum([len(c) for c in body])  # total body string length
        total_dist = 0                     # total Levenshtein distance
        count = 0                          # number of checked files
        for f in self.files:
            if not strict and count >= max_count:
                break  # skip the remaining files
            fpath = os.path.join(self.basedir, f)
            if is_file(fpath, self.ext.strip(".").upper())[0] \
                    and f != self.filename:
                froot, fext = os.path.splitext(f)
                # Get the string length difference between the corresponding
                # froot string length and the total body string length
                ld = len(froot) - bln
                # Get the froot segment to compare to the body component
                froot_seg = froot[si:si + cln]
                # Calculate the character offset between the body component and
                # the possibly longer froot segment (of a non-zero padded sequence)
                char_offset = (cln + ld) - cln
                if char_offset > 0:  # len(froot_seg) > len(body[i])
                    for j in range(char_offset):
                        idx = si + cln + j
                        if not froot[idx].isdigit():
                            break
                        froot_seg += froot[idx]  # add digits only
                # Compute the Levenshtein distance
                dist = levenshtein_distance(froot_seg, body[i])
                total_dist += dist
                count += 1
        # Distances greater than zero mean a high sequence probability,
        # because this body component changes from root to root, and the
        # digit verification filters out indices of body components that are
        # not numbers. This is especially important for roots with non-zero
        # padded sequences, because the absence of padding makes the root
        # length bigger for higher ranging roots, and thus a distance greater
        # than 0 would get erroneously predicted.
        if total_dist > 0 and body[i].isdigit():
            predicted.append(i)
        # Increment si to the start index of the next body component
        si += cln

    # if len(predicted) < 1 or len(predicted) >= len(body):
    if len(predicted) < 1 or len(predicted) > len(body):
        raise RuntimeError(
            "Unable to predict number sequence for '{}'".format(
                self.filename))

    return predicted
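For orientation, a hypothetical call: body holds the head, midsection, and tail of a filename root, and the method returns the indices of the components that behave like a number sequence across the directory (the renamer instance below is a placeholder):

# Illustrative only; `renamer` is a placeholder for the object that owns
# _predict_sequence, and the split into head/midsection/tail is assumed.
body = ["shot_", "0042", "_final"]        # head, midsection, tail of a root
# renamer._predict_sequence(body)         # -> e.g. [1]: "0042" varies per file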
base_df["num_words_a"] = base_df["name_a"].apply( lambda x: u.get_number_words(x)) base_df["num_words_b"] = base_df["name_b"].apply( lambda x: u.get_number_words(x)) # Get the length of the strings base_df["len_a"] = base_df["name_a"].apply(lambda x: len(str(x))) base_df["len_b"] = base_df["name_b"].apply(lambda x: len(str(x))) # Get Jaro Winkler distance base_df["JW_distance"] = base_df.apply( lambda row: u.jaro_winkler_distance(row["name_a"], row["name_b"]), axis=1) # Get Levenshtein distance base_df["LV_distance"] = base_df.apply( lambda row: u.levenshtein_distance(row["name_a"], row["name_b"]), axis=1) # Get the target base_df["target"] = df["accept_match"].apply(lambda x: u.convert_target(x)) #### Split the dataset into train, dev, test set ##### Downsample Target Column base_df = base_df.sample(frac=1).reset_index(drop=True) # Separate majority and minority classes majority_df = base_df[base_df.target == base_df["target"].value_counts().index[0]] minority_df = base_df[base_df.target == base_df["target"].value_counts().index[-1]]
def daj_ciag(self, ciag, odleglosc):
    # Return every stored string whose Levenshtein distance to `ciag` is
    # smaller than `odleglosc`. The original filter() call was missing its
    # iterable; `self.words` is a placeholder name for the collection of
    # candidate strings the object is assumed to hold.
    return list(
        filter(lambda x: levenshtein_distance(x, ciag) < odleglosc,
               self.words))
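A hypothetical usage of the fixed method; roughly, daj_ciag means "give the strings", ciag "string", and odleglosc "distance". The matcher instance and its words attribute are placeholders:

# matcher.words = ["kot", "kos", "dom"]
# matcher.daj_ciag("kot", 2)   # -> ["kot", "kos"]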
def create_corpus_from_wiki(self, corpus_root, filename, output_dir):
    create_error_corpus = False
    valid_word_pat = ur'(?u)^\w+$'
    sentences = utils.get_sentences_for_text(corpus_root, filename)
    if sentences is None:
        return
    top_rev = []
    top_rev_with_err = []
    try:
        for s_list in sentences:
            s = ' '.join(s_list)
            if s.startswith('[Revision timestamp:'):
                self.num_rev += 1
            else:
                if self.num_rev == 1:
                    if len(s_list) >= self.min_sen_len:
                        rev_sen = RevisionSentence(s_list)
                        top_rev.append(rev_sen)
                elif self.num_rev > 1:
                    for r in top_rev:
                        if len(s_list) == len(r.orig_tokens):
                            valid_errors = True
                            errors = False
                            old_curr_rev_sen = zip(r.orig_tokens, s_list)
                            for t in old_curr_rev_sen:
                                dist = utils.levenshtein_distance(t[0], t[1])
                                if dist > 0 and dist <= self.max_dist:
                                    # token must be a word
                                    orig_uni = utils.to_unicode_or_bust(t[0])
                                    match = re.search(valid_word_pat, orig_uni)
                                    if match:
                                        errors = True
                                elif dist > self.max_dist:
                                    valid_errors = False
                                    break
                            if errors and valid_errors:
                                r.add_err_sentence(s_list)
                                create_error_corpus = True
                                break
    except AssertionError:
        print 'Empty file'

    if create_error_corpus:
        with codecs.open(output_dir + '/' + filename, 'w', 'utf-8',
                         errors='ignore') as f:
            for r in top_rev:
                if r.contains_spelling_errors():
                    orig_sen = ' '.join(r.orig_tokens)
                    err_as_sen = map(lambda x: ' '.join(x), r.err_sen)
                    orig_err_sen = [orig_sen] + err_as_sen
                    to_write = '####'.join(orig_err_sen)
                    to_write_uni = unicode(to_write, encoding='utf-8',
                                           errors='ignore')
                    f.write(to_write_uni + u'\n')
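The RevisionSentence container is not shown. A minimal sketch reconstructed from how it is used above (orig_tokens, add_err_sentence, err_sen, contains_spelling_errors); the real class may carry more state:

class RevisionSentence(object):

    def __init__(self, orig_tokens):
        self.orig_tokens = orig_tokens   # tokens of the first revision
        self.err_sen = []                # later revisions containing errors

    def add_err_sentence(self, tokens):
        self.err_sen.append(tokens)

    def contains_spelling_errors(self):
        return len(self.err_sen) > 0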
def template_based():
    if not os.path.exists(hp.data_path + '/generate'):
        os.makedirs(hp.data_path + '/generate')
    print('template_based...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in pos_lines)
            pos_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in pos_lines)
            sentences1 = neg_sentences_contents_dict
            sentences2 = pos_sentences_contents_dict
            marker2 = pos_sentences_marker_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in pos_lines)
            neg_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in neg_lines)
            sentences1 = pos_sentences_contents_dict
            sentences2 = neg_sentences_contents_dict
            marker2 = neg_sentences_marker_dict

        with codecs.open(hp.data_path + '/generate/template_based.test.' + num,
                         'w', 'utf-8') as fout:
            for sentence1 in sentences1:
                dist_dict = {}
                # Search up to hp.max_candidates randomly.
                frag_sentences2 = random.sample(sentences2.keys(),
                                                hp.max_candidates)
                sentence1_content = sentences1[sentence1]
                for sentence2 in frag_sentences2:
                    # distance between pos_content and neg_content
                    dist_dict[sentence2] = levenshtein_distance(
                        sentence1_content, sentences2[sentence2])
                min_sentence = min(dist_dict, key=dist_dict.get)
                nearest_marker = marker2[min_sentence]

                sentence1_list = sentence1.split(' ')
                sentence1_content_list = sentences1[sentence1].split(' ')
                # Insert attribute markers in contents
                index = 0
                for idx in range(len(sentence1_list)):
                    if sentence1_list[idx] != sentence1_content_list[idx]:
                        index = idx
                        break
                generated_sentence = ' '.join(sentence1_content_list[:index]) + ' ' + \
                    nearest_marker + ' ' + ' '.join(sentence1_content_list[index:])
                # Collapse double spaces left by the join
                generated_sentence = generated_sentence.replace('  ', ' ')
                fout.write("- expected: " + sentence1 + "\n")
                fout.write("- got: " + generated_sentence + "\n\n")
                fout.flush()
def retrieve_only(dist_mode='levenshtein'):
    print('retrieve_only with ' + dist_mode + ' distance...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = neg_sentences_dict
            sentences2 = pos_sentences_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = pos_sentences_dict
            sentences2 = neg_sentences_dict

        with codecs.open(hp.data_path + '/generate/retrieve_only.test.' + num,
                         'w', 'utf-8') as fout:
            # Levenshtein distance
            if dist_mode == 'levenshtein':
                for sentence1 in sentences1:
                    dist_dict = {}
                    # Search up to hp.max_candidates randomly.
                    frag_sentences2 = random.sample(sentences2.keys(),
                                                    hp.max_candidates)
                    for sentence2 in frag_sentences2:
                        # distance between pos_content and neg_content
                        dist_dict[sentence2] = levenshtein_distance(
                            sentences1[sentence1], sentences2[sentence2])
                    nearest_sentence = min(dist_dict, key=dist_dict.get)
                    fout.write("- expected: " + sentence1 + "\n")
                    fout.write("- got: " + nearest_sentence + "\n\n")
                    fout.flush()

            # Embedding similarity between sentence1 and sentence2 using the
            # Universal Sentence Encoder [1]; much slower and did not perform
            # as well in practice.
            if dist_mode == 'embedding':
                embed = hub.Module(
                    "https://tfhub.dev/google/universal-sentence-encoder/1")
                with tf.Session() as session:
                    session.run([
                        tf.global_variables_initializer(),
                        tf.tables_initializer()
                    ])
                    embedded_sentences1 = session.run(embed(sentences1.values()))
                    for sentence1, embedded_sentence1 in zip(
                            sentences1.keys(), embedded_sentences1):
                        dist_dict = {}
                        # Search up to hp.max_candidates randomly.
                        frag_sentences2 = random.sample(sentences2.keys(),
                                                        hp.max_candidates)
                        frag_contents2 = []
                        for frag_sentence2 in frag_sentences2:
                            frag_contents2.append(sentences2[frag_sentence2])
                        embedded_sentences2 = session.run(embed(frag_contents2))
                        for idx, embedded_sentence2 in enumerate(
                                embedded_sentences2):
                            dist_dict[idx] = np.inner(embedded_sentence1,
                                                      embedded_sentence2)
                        nearest_idx = max(dist_dict, key=dist_dict.get)
                        nearest_sentence = frag_sentences2[nearest_idx]
                        fout.write("- expected: " + sentence1 + "\n")
                        fout.write("- got: " + nearest_sentence + "\n\n")
                        fout.flush()