def main():
    parser = argparse.ArgumentParser(
        description='Rank corpus based on laser cosine distance')
    parser.add_argument('--debug', help='debug mode', action='store_true')
    parser.add_argument('--src_sents', help='source sentences')
    parser.add_argument('--tgt_sents', help='target sentences')
    parser.add_argument('--src_embs',
                        help='laser embeddings for source sentences')
    parser.add_argument('--tgt_embs',
                        help='laser embeddings for target sentences')
    parser.add_argument('--output_path',
                        help='output directory for the ranked corpus')
    parser.add_argument('--output_corpus',
                        help='filename of the ranked corpus inside output_path')
    o = parser.parse_args()

    try:
        os.makedirs(o.output_path)
    except FileExistsError:
        # directory already exists
        pass

    output_corpus = os.path.join(o.output_path, o.output_corpus)

    src_emb = load_laser_embs(o.src_embs)
    tgt_emb = load_laser_embs(o.tgt_embs)

    sim = []
    for v1, v2 in zip(src_emb, tgt_emb):
        sim.append(similarity(v1, v2))

    sim_sorted = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)

    with open(output_corpus, 'w') as output, \
            open(o.src_sents, 'r') as src_file, \
            open(o.tgt_sents, 'r') as tgt_file:
        src = src_file.readlines()
        tgt = tgt_file.readlines()

        pbar = tqdm.tqdm(total=len(src))

        for similarity_index in sim_sorted:
            pbar.update(1)
            src_sentence = src[similarity_index].strip()
            tgt_sentence = tgt[similarity_index].strip()

            # Exclude almost identical sentences or too short sentence-pairs;
            # exclude sentences containing a lot of numbers
            if levenshtein_distance(
                    src_sentence,
                    tgt_sentence) < 30 or perc_numeric(src_sentence) > 0.3:
                continue

            output.write('{0}\t{1}\n'.format(src_sentence, tgt_sentence))

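The helpers load_laser_embs, similarity and perc_numeric are defined elsewhere in the project. A minimal sketch of compatible implementations, assuming the embeddings are stored as raw float32 vectors of dimension 1024 (both assumptions, not confirmed by the snippet):

import numpy as np


def load_laser_embs(path, dim=1024):
    # Read raw float32 vectors; the 1024-dimensional layout is an assumption
    # based on the standard LASER encoder.
    vectors = np.fromfile(path, dtype=np.float32)
    return vectors.reshape(-1, dim)


def similarity(v1, v2):
    # Cosine similarity between two embedding vectors.
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))


def perc_numeric(sentence):
    # Fraction of characters in the sentence that are digits.
    return sum(c.isdigit() for c in sentence) / max(len(sentence), 1)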
Example #2
    def normalize_word(self, word):
        """OOV handling: first look for a known token within Levenshtein
        distance 1 or 2; otherwise return the closest word by cosine similarity."""
        if word in self.tokens:
            return word
        lv_distances = defaultdict(list)
        for token in self.tokens:
            distance = levenshtein_distance(word, token)
            if 1 <= distance <= 2:
                lv_distances[distance].append(token)
        for i in range(1, 3):
            if lv_distances[i]:
                return lv_distances[i][0]

        return self.oov.closest_to_tokens(word, self.tokens.keys())
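For illustration, a self-contained sketch of the lookup order described in the docstring, using a plain function and a set in place of self.tokens; the cosine-similarity fallback (self.oov) is omitted and the python-Levenshtein distance function is assumed:

from Levenshtein import distance as levenshtein_distance


def normalize_word_sketch(word, vocab):
    # Known words are returned unchanged.
    if word in vocab:
        return word
    # Prefer candidates at edit distance 1, then 2, mirroring the method above.
    for max_dist in (1, 2):
        for token in vocab:
            if levenshtein_distance(word, token) == max_dist:
                return token
    return None  # the original method falls back to cosine similarity here


# normalize_word_sketch("helo", {"hello", "world"})  -> "hello"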
Example #3
    def _too_similar(candidate, already_considered, threshold=0.8):
        """
        Computes a similarity ratio, 1 - levenshtein_distance / length of the
        longer keyword, between the new candidate and every previously
        considered candidate.

        Returns True if, for any previous candidate, the ratio exceeds the
        threshold, which indicates that the pair of words is too similar.

        The candidates have previously been ordered by ascending weight.

        :param candidate: str -> current keyword candidate being considered
        :param already_considered: list of (keyword, weight) tuples -> keywords already considered
        :param threshold: float in [0, 1] -> maximum similarity ratio allowed
        :return: bool
        """

        return any((1.0 - levenshtein_distance(candidate, keyword)[0] /
                    max(len(candidate), len(keyword)) > threshold)
                   for keyword, _ in already_considered)
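Note that levenshtein_distance appears to return a tuple here (the distance is taken from index 0). A self-contained sketch of the same ratio using the python-Levenshtein package, which returns a plain integer:

from Levenshtein import distance as levenshtein_distance


def similarity_ratio(a, b):
    # 1 minus the edit distance normalized by the longer string's length.
    return 1.0 - levenshtein_distance(a, b) / max(len(a), len(b))


# similarity_ratio("keyword", "keywords") == 0.875, which is above a 0.8
# threshold, so such a pair would be rejected as too similar.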
Example #4
def retrieve_nearest_marker(lines1, attribute):
    if attribute == 0:
        lines2 = load_sentences(hp.data_path + '/delete/delete.test.0', False)
    else:
        lines2 = load_sentences(hp.data_path + '/delete/delete.test.1', False)
    sentences_contents_dict1 = dict(x.split('\t')[:2] for x in lines1)
    sentences_contents_dict2 = dict(x.split('\t')[:2] for x in lines2)
    sentences_marker_dict = dict([x.split('\t')[0], x.split('\t')[2]] for x in lines2)
    sentences1 = sentences_contents_dict1
    sentences2 = sentences_contents_dict2
    marker = sentences_marker_dict
    attribute_markers = []
    for sentence1 in sentences1:
        dist_dict = {}
        sentence1_content = sentences1[sentence1]
        for sentence2 in sentences2.keys():
            # distance between pos_content and neg_content
            dist_dict[sentence2] = levenshtein_distance(sentence1_content, sentences2[sentence2])
        min_sentence = min(dist_dict, key=dist_dict.get)
        # nearest_marker
        attribute_markers.append(marker[min_sentence])
    return attribute_markers
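load_sentences and hp come from the surrounding project and are not shown. A hedged sketch of a loader compatible with the tab-separated records used above (the lowercase flag is an assumption):

def load_sentences(path, lowercase=False):
    # Hypothetical stand-in for the project's loader: one tab-separated
    # record per line, optionally lower-cased.
    with open(path, encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f if line.strip()]
    return [line.lower() for line in lines] if lowercase else lines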
Example #5
base_df['num_words_a'] = base_df['name_a'].apply(
    lambda x: u.get_number_words(x))
base_df['num_words_b'] = base_df['name_b'].apply(
    lambda x: u.get_number_words(x))

# Get the length of the strings
base_df['len_a'] = base_df['name_a'].apply(lambda x: len(str(x)))
base_df['len_b'] = base_df['name_b'].apply(lambda x: len(str(x)))

# Get Jaro Winkler distance
base_df['JW_distance'] = base_df.apply(
    lambda row: u.jaro_winkler_distance(row['name_a'], row['name_b']), axis=1)

# Get Levenshtein distance
base_df['LV_distance'] = base_df.apply(
    lambda row: u.levenshtein_distance(row['name_a'], row['name_b']), axis=1)

# Get the target
base_df['target'] = df['accept_match'].apply(lambda x: u.convert_target(x))

###################################
# Split the dataset into train, dev, test set
###################################

base_df = base_df.sample(frac=1).reset_index(drop=True)

non_numerical_cols = ['name_a', 'name_b', 'acr_a', 'acr_b']
feature_columns = [
    'acr_match', 'JW_distance', 'LV_distance', 'num_words_a', 'num_words_b',
    'len_a', 'len_b'
]
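The snippet is cut off before the split it announces. One possible continuation, with the 80/10/10 proportions being an assumption:

# Hypothetical index-based split after the shuffle above (proportions assumed).
n = len(base_df)
train_df = base_df.iloc[:int(0.8 * n)]
dev_df = base_df.iloc[int(0.8 * n):int(0.9 * n)]
test_df = base_df.iloc[int(0.9 * n):]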
Example #6
    def _predict_sequence(self, body, strict=False):
        """Predicts the part(s) of a filename root that is a/are number
            sequence(s), by comparing each of the components to the
            corresponding components of the filename roots inside the
            base directory.

        Args:
          body (list): Head, midsection, and tail of a filename root
          strict (bool): True, to check against all filename roots in
            the base directory, or by default False, to only compare
            to a minimum amount

        Raises:
          RuntimeError: Unable to predict number sequence for 'self.filename'

        Returns:
          A list of indices of the predicted body components.
        """
        # Get the max. number of files from the base directory to check
        max_count = 10 ** (len(str(self.fcount)) - 1)
        max_count = max_count if max_count > 10 else self.fcount
        # Predict the sequence(s)
        si = 0  # string start index of body component
        predicted = []  # indices of predicted sequence body components

        for i in xrange(len(body)):
            cln = len(body[i])  # string length of body component
            bln = sum([len(c) for c in body])  # total body string length
            total_dist = 0  # total Levenshtein distance
            count = 0  # number of checked files

            for f in self.files:
                if not strict and count >= max_count:
                    break  # skip the remaining files
                fpath = os.path.join(self.basedir, f)
                if is_file(fpath, self.ext.strip(".").upper())[0] \
                        and f != self.filename:
                    froot, fext = os.path.splitext(f)
                    # Get the string length difference between the corresponding
                    # froot string length and the total body string length
                    ld = len(froot) - bln
                    # Get the froot segment to compare to the body component
                    froot_seg = froot[si:si + cln]
                    # Calculate the character offset between the body component and
                    # the possibly longer froot segment (of a non-zero padded sequence)
                    char_offset = (cln + ld) - cln
                    if char_offset > 0:  # len(froot_seg) > len(body[i])
                        for j in range(char_offset):
                            idx = si + cln + j
                            if not froot[idx].isdigit():
                                break
                            froot_seg += froot[idx]  # add digits only
                    # Compute the Levenshtein distance
                    dist = levenshtein_distance(froot_seg, body[i])
                    total_dist += dist
                count += 1

            # Distances greater than zero indicate a likely sequence, because
            # this body component changes from root to root. The digit check
            # filters out body components that are not numbers, which matters
            # especially for roots with non-zero-padded sequences: the absence
            # of padding makes the root longer for higher-numbered roots, so a
            # distance greater than 0 would otherwise be predicted erroneously.
            if total_dist > 0 and body[i].isdigit():
                predicted.append(i)
            # Increment si to the start index of the next body component
            si += cln

        # if len(predicted) < 1 or len(predicted) >= len(body):
        if len(predicted) < 1 or len(predicted) > len(body):
            raise RuntimeError(
                "Unable to predict number sequence for '{}'".format(
                    self.filename))
        return predicted
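A simplified, self-contained illustration of the idea behind _predict_sequence, with hypothetical filenames and without the padding-offset handling of the full method:

from Levenshtein import distance as levenshtein_distance


def predict_sequence_components(body, sibling_roots):
    # A body component is treated as a number sequence if it is all digits and
    # its segment changes (non-zero total distance) across sibling filename roots.
    predicted, si = [], 0
    for i, comp in enumerate(body):
        total_dist = sum(
            levenshtein_distance(root[si:si + len(comp)], comp)
            for root in sibling_roots)
        if total_dist > 0 and comp.isdigit():
            predicted.append(i)
        si += len(comp)
    return predicted


# predict_sequence_components(["shot_", "0001", "_v2"],
#                             ["shot_0002_v2", "shot_0003_v2"])  -> [1]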
Example #7
base_df["num_words_a"] = base_df["name_a"].apply(
    lambda x: u.get_number_words(x))
base_df["num_words_b"] = base_df["name_b"].apply(
    lambda x: u.get_number_words(x))

# Get the length of the strings
base_df["len_a"] = base_df["name_a"].apply(lambda x: len(str(x)))
base_df["len_b"] = base_df["name_b"].apply(lambda x: len(str(x)))

# Get Jaro Winkler distance
base_df["JW_distance"] = base_df.apply(
    lambda row: u.jaro_winkler_distance(row["name_a"], row["name_b"]), axis=1)

# Get Levenshtein distance
base_df["LV_distance"] = base_df.apply(
    lambda row: u.levenshtein_distance(row["name_a"], row["name_b"]), axis=1)

# Get the target
base_df["target"] = df["accept_match"].apply(lambda x: u.convert_target(x))

#### Split the dataset into train, dev, test set

##### Downsample Target Column
base_df = base_df.sample(frac=1).reset_index(drop=True)

# Separate majority and minority classes
majority_df = base_df[base_df.target ==
                      base_df["target"].value_counts().index[0]]
minority_df = base_df[base_df.target ==
                      base_df["target"].value_counts().index[-1]]
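The snippet stops after separating the classes. A minimal sketch of the downsampling step announced by the heading (sampling strategy and random_state are assumptions):

import pandas as pd

# Downsample the majority class to the minority class size, then reshuffle.
majority_downsampled_df = majority_df.sample(n=len(minority_df), random_state=42)
balanced_df = pd.concat([majority_downsampled_df, minority_df]).sample(
    frac=1).reset_index(drop=True)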
Example #8
    def daj_ciag(self, ciag, odleglosc):
        # Return stored words within Levenshtein distance 'odleglosc' of 'ciag'.
        # 'self.slowa' is hypothetical: the snippet omits filter()'s iterable.
        return list(
            filter(lambda x: levenshtein_distance(x, ciag) < odleglosc,
                   self.slowa))
Example #9
    def create_corpus_from_wiki(self, corpus_root, filename, output_dir):
        create_error_corpus = False
        valid_word_pat = ur'(?u)^\w+$'
        sentences = utils.get_sentences_for_text(corpus_root, filename)
        if sentences is None:
            return
        top_rev = []
        top_rev_with_err = []
        try:
            for s_list in sentences:
                s = ' '.join(s_list)
                if s.startswith('[Revision timestamp:'):
                    self.num_rev += 1
                else:
                    if self.num_rev == 1:
                        if len(s_list) >= self.min_sen_len:
                            rev_sen = RevisionSentence(s_list)
                            top_rev.append(rev_sen)
                    elif self.num_rev > 1:
                        for r in top_rev:
                            if len(s_list) == len(r.orig_tokens):
                                valid_errors = True
                                errors = False
                                old_curr_rev_sen = zip(r.orig_tokens, s_list)
                                for t in old_curr_rev_sen:
                                    dist = utils.levenshtein_distance(
                                        t[0], t[1])
                                    if dist > 0 and dist <= self.max_dist:
                                        # token must be a word
                                        orig_uni = utils.to_unicode_or_bust(
                                            t[0])
                                        match = re.search(
                                            valid_word_pat, orig_uni)
                                        if match:
                                            errors = True
                                    elif dist > self.max_dist:
                                        valid_errors = False
                                        break
                                if errors and valid_errors:
                                    print 'errr'
                                    r.add_err_sentence(s_list)
                                    create_error_corpus = True
                                    break
        except AssertionError:
            print 'Empty file'

        if create_error_corpus:
            with codecs.open(output_dir + '/' + filename,
                             'w',
                             'utf-8',
                             errors='ignore') as f:
                for r in top_rev:
                    if r.contains_spelling_errors():
                        orig_sen = ' '.join(r.orig_tokens)
                        err_as_sen = map(lambda x: ' '.join(x), r.err_sen)
                        orig_err_sen = [orig_sen] + err_as_sen
                        to_write = '####'.join(orig_err_sen)
                        to_write_uni = unicode(to_write,
                                               encoding='utf-8',
                                               errors='ignore')
                        f.write(to_write_uni + u'\n')
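RevisionSentence is not shown; judging from its usage above, a minimal compatible sketch (hypothetical, the real class lives in the surrounding project) could look like this:

class RevisionSentence(object):
    # Hypothetical sketch reconstructed from usage: stores the original tokens
    # and any later revisions that differ only by small spelling edits.
    def __init__(self, orig_tokens):
        self.orig_tokens = orig_tokens
        self.err_sen = []

    def add_err_sentence(self, tokens):
        self.err_sen.append(tokens)

    def contains_spelling_errors(self):
        return len(self.err_sen) > 0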
def template_based():
    if not os.path.exists(hp.data_path + '/generate'):
        os.makedirs(hp.data_path + '/generate')
    print('template_based...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in pos_lines)
            pos_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in pos_lines)
            sentences1 = neg_sentences_contents_dict
            sentences2 = pos_sentences_contents_dict
            marker2 = pos_sentences_marker_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(
                x.split('\t')[:2] for x in pos_lines)
            neg_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in neg_lines)
            sentences1 = pos_sentences_contents_dict
            sentences2 = neg_sentences_contents_dict
            marker2 = neg_sentences_marker_dict
        with codecs.open(hp.data_path + '/generate/template_based.test.' + num,
                         'w', 'utf-8') as fout:
            for sentence1 in sentences1:
                dist_dict = {}
                # Search up to hp.max_candidates randomly.
                frag_sentences2 = random.sample(list(sentences2),
                                                hp.max_candidates)
                sentence1_content = sentences1[sentence1]
                for sentence2 in frag_sentences2:
                    # distance between pos_content and neg_content
                    dist_dict[sentence2] = levenshtein_distance(
                        sentence1_content, sentences2[sentence2])
                min_sentence = min(dist_dict, key=dist_dict.get)
                nearest_marker = marker2[min_sentence]
                sentence1_list = sentence1.split(' ')
                sentence1_content_list = sentences1[sentence1].split(' ')
                # Insert attribute markers in contents
                index = 0
                for idx in range(len(sentence1_list)):
                    if sentence1_list[idx] != sentence1_content_list[idx]:
                        index = idx
                        break
                generated_sentence = ' '.join(sentence1_content_list[:index]) + ' ' + \
                                     nearest_marker + ' ' + ' '.join(sentence1_content_list[index:])
                generated_sentence = generated_sentence.replace('  ', ' ')
                fout.write("- expected: " + sentence1 + "\n")
                fout.write("- got: " + generated_sentence + "\n\n")
                fout.flush()
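A self-contained walk-through of the marker-insertion step above, with hypothetical example strings:

sentence1 = "the food was very bland today"      # full styled sentence
sentence1_content = "the food was today"         # content words only
nearest_marker = "really tasty"                  # marker from the opposite style

sentence1_list = sentence1.split(' ')
sentence1_content_list = sentence1_content.split(' ')
index = 0
for idx in range(len(sentence1_list)):
    if sentence1_list[idx] != sentence1_content_list[idx]:
        index = idx
        break
generated_sentence = ' '.join(sentence1_content_list[:index]) + ' ' + \
                     nearest_marker + ' ' + ' '.join(sentence1_content_list[index:])
print(generated_sentence.replace('  ', ' '))     # -> "the food was really tasty today"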
def retrieve_only(dist_mode='levenshtein'):

    print('retrieve_only with ' + dist_mode + ' distance...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = neg_sentences_dict
            sentences2 = pos_sentences_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = pos_sentences_dict
            sentences2 = neg_sentences_dict
        with codecs.open(hp.data_path + '/generate/retrieve_only.test.' + num,
                         'w', 'utf-8') as fout:
            # Levenshtein distance
            if dist_mode == 'levenshtein':
                for sentence1 in sentences1:
                    dist_dict = {}
                    # Search up to hp.max_candidates randomly.
                    frag_sentences2 = random.sample(list(sentences2),
                                                    hp.max_candidates)
                    for sentence2 in frag_sentences2:
                        # distance between pos_content and neg_content
                        dist_dict[sentence2] = levenshtein_distance(
                            sentences1[sentence1], sentences2[sentence2])
                    nearest_sentence = min(dist_dict, key=dist_dict.get)
                    fout.write("- expected: " + sentence1 + "\n")
                    fout.write("- got: " + nearest_sentence + "\n\n")
                    fout.flush()

            # Embedding distance between sentence1 and sentence2 using the
            # universal sentence encoder; this is much slower and does not
            # perform as well.
            if dist_mode == 'embedding':
                embed = hub.Module(
                    "https://tfhub.dev/google/universal-sentence-encoder/1")
                with tf.Session() as session:
                    session.run([
                        tf.global_variables_initializer(),
                        tf.tables_initializer()
                    ])
                    embedded_sentences1 = session.run(
                        embed(list(sentences1.values())))
                    for sentence1, embedded_sentence1 in zip(
                            sentences1.keys(), embedded_sentences1):
                        dist_dict = {}
                        # Search up to hp.max_candidates randomly.
                        frag_sentences2 = random.sample(
                            list(sentences2), hp.max_candidates)
                        frag_contents2 = []
                        for frag_sentence2 in frag_sentences2:
                            frag_contents2.append(sentences2[frag_sentence2])
                        embedded_sentences2 = session.run(
                            embed(frag_contents2))
                        for idx, embedded_sentence2 in enumerate(
                                embedded_sentences2):
                            dist_dict[idx] = np.inner(embedded_sentence1,
                                                      embedded_sentence2)
                        nearest_idx = max(dist_dict, key=dist_dict.get)
                        nearest_sentence = frag_sentences2[nearest_idx]
                        fout.write("- expected: " + sentence1 + "\n")
                        fout.write("- got: " + nearest_sentence + "\n\n")
                        fout.flush()