Example #1
def suitable_candidate(tokens, i, c):
    """Checks if candidate is suitable for simplification."""
    # Candidate is simpler than the original word (higher Zipf frequency).
    less_complex = zfreq(c[0], LANG) > zfreq(tokens[i], LANG)
    # Candidate is not a morphological derivation of the original word.
    not_morph = c[0] not in tokens[i] and tokens[i] not in c[0]

    return less_complex and not_morph
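A minimal usage sketch for the filter above. It assumes zfreq wraps wordfreq.zipf_frequency (higher score = more common word) and that candidates are (word, similarity) tuples, e.g. from gensim's most_similar; both are inferences from the identifiers, not confirmed by the source.

from wordfreq import zipf_frequency

LANG = "en"

def zfreq(word, lang):
    # Assumed wrapper around wordfreq's Zipf scale (roughly 0-8).
    return zipf_frequency(word, lang)

tokens = ["a", "convoluted", "sentence"]
candidates = [("complicated", 0.81), ("convolutedly", 0.74), ("complex", 0.70)]
# Keeps "complicated" and "complex"; drops "convolutedly", which is a
# morphological derivation of the source word.
print([c for c in candidates if suitable_candidate(tokens, 1, c)])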
Example #2
def simplify_token(tokens, i):
    """Simplifies the token at index i, given the full token list."""
    # Generate candidates using BERT.
    candidates = generate_candidates(tokens, i)

    # Filter out unsuitable candidates; reverse so that a higher list index
    # corresponds to a better BERT rank.
    candidates = [c for c in candidates
                  if suitable_candidate(tokens[i], c)][::-1]
    # Rank by frequency: a higher index means a simpler (more frequent) word.
    complex_ranked = sorted(candidates, key=lambda c: zfreq(c, config.lang))

    if models.embeddings:
        # If word embeddings are loaded, also rank by cosine and APSynP
        # similarity (again, a higher index is better).
        cosine_ranked = sorted(candidates,
                               key=lambda c: cosine_sim(tokens[i], c))
        apsynp_ranked = sorted(candidates,
                               key=lambda c: apsyn_sim(tokens[i], c))

        # Combine the BERT, cosine, APSynP and frequency ranks.
        overall_ranked = [(c, candidates.index(c) + cosine_ranked.index(c) +
                           apsynp_ranked.index(c) + complex_ranked.index(c))
                          for c in candidates]

    else:
        # Without embeddings, combine only the BERT and frequency ranks.
        overall_ranked = [(c, candidates.index(c) + complex_ranked.index(c))
                          for c in candidates]

    # Sort candidates by combined rank, best (highest) first.
    overall_ranked = sorted(overall_ranked, key=lambda c: c[1], reverse=True)

    return overall_ranked
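The combination step is a rank-sum (Borda-style) aggregation: each feature list contributes a candidate's index, and because every list is sorted so that a higher index is better, a larger sum means a better candidate. A self-contained toy illustration of the idea:

candidates = ["intricate", "complicated", "plain"]
freq_ranked = ["intricate", "complicated", "plain"]    # most complex first
cosine_ranked = ["plain", "intricate", "complicated"]  # least similar first

scores = [(c, freq_ranked.index(c) + cosine_ranked.index(c))
          for c in candidates]
# "complicated" wins: both simple and similar, so highest combined index.
print(sorted(scores, key=lambda s: s[1], reverse=True))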
Example #3
def suitable_complex_word(w):
    """Checks if a detected word is suitable for replacement."""
    # Not a stopword or punctuation.
    not_stopword = w not in safe_get_stop_words(config.lang) and w.isalpha()
    # Not already a simple word (Zipf frequency below the configured threshold).
    not_simple = zfreq(w, config.lang) < config.min_complexity
    # All lowercase (ensures named entities are not simplified).
    not_uppercase = w.islower()

    return not_stopword and not_simple and not_uppercase
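A sketch of how suitable_complex_word might be exercised, assuming config is a simple namespace carrying a language code and a Zipf-frequency threshold (4.0 is a plausible value, but it is an assumption here):

from stop_words import safe_get_stop_words
from wordfreq import zipf_frequency

class config:  # hypothetical stand-in for the project's config module
    lang = "en"
    min_complexity = 4.0

def zfreq(word, lang):
    return zipf_frequency(word, lang)

for w in ["the", "convoluted", "London", "word"]:
    print(w, suitable_complex_word(w))
# "the" is a stopword, "London" is capitalised, "word" is too frequent;
# only "convoluted" passes all three checks.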
Example #4
def suitable_candidate(w, c):
    """Checks if a candidate is a suitable substitute based on
    various criteria."""
    source_stem = stemmer.stem(w)
    candidate_stem = stemmer.stem(c)

    # Candidate stem does not share its first three characters with the
    # source stem.
    not_stem_len = not (len(candidate_stem) >= 3
                        and candidate_stem[:3] == source_stem[:3])
    # Candidate does not share a stem with the original word.
    not_equal_stem = source_stem != candidate_stem
    # Not punctuation.
    not_punctuation = c.isalpha()

    # Other checks (disable when benchmarking).
    not_morph_deriv = c not in w and w not in c
    not_complex = zfreq(c, config.lang) > zfreq(w, config.lang)
    not_stopword = c not in safe_get_stop_words(config.lang) and c.isalpha()

    return (not_equal_stem and not_stem_len and not_punctuation
            and not_morph_deriv and not_stopword and not_complex)
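The stemmer is not defined in the snippet; an NLTK Snowball stemmer is one plausible choice (an assumption, reusing the zfreq and config stand-ins from the sketch above):

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

print(suitable_candidate("elucidate", "explain"))  # True: new stem, simpler
print(suitable_candidate("running", "runs"))       # False: both stem to "run"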
Example #5
def simplify_token(tokens, i):
    """Simplifies a token given index and all tokens."""
    # Get the CANDIDATE_NO most similar words and drop unsuitable ones.
    candidates = wv_model.most_similar(tokens[i], topn=CANDIDATE_NO)
    candidates = [c for c in candidates if suitable_candidate(tokens, i, c)]
    # Rank candidates on each feature (higher index = better).
    syntactic_ranked = sorted(candidates, key=lambda c: c[1])
    complexity_ranked = sorted(candidates, key=lambda c: zfreq(c[0], LANG))
    context_ranked = sorted(
        candidates, key=lambda c: context_sim(tokens, i, c[0], WINDOW_SIZE))
    # Sum the per-feature ranks for each candidate (higher = better).
    overall_ranked = [(c[0], syntactic_ranked.index(c) +
                       complexity_ranked.index(c) + context_ranked.index(c))
                      for c in candidates]

    return overall_ranked
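Example #5 leans on several names defined elsewhere; a hedged sketch of what they might look like (gensim 3.x KeyedVectors, matching the .vocab usage in Example #6, and a hypothetical context_sim that averages similarity to the words around position i):

from gensim.models import KeyedVectors
from wordfreq import zipf_frequency

LANG, CANDIDATE_NO, WINDOW_SIZE = "en", 10, 2

# Hypothetical path; any word2vec-format vector file works.
wv_model = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)

def zfreq(word, lang):
    return zipf_frequency(word, lang)

def context_sim(tokens, i, candidate, window):
    # Mean similarity between the candidate and the in-vocabulary words in a
    # window around position i (assumed behaviour).
    context = tokens[max(0, i - window):i] + tokens[i + 1:i + 1 + window]
    context = [w for w in context if w in wv_model.vocab]
    if not context:
        return 0.0
    return sum(wv_model.similarity(candidate, w) for w in context) / len(context)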
Example #6

if __name__ == '__main__':
    # If supplied, set text to user input.
    if len(sys.argv) > 1:
        raw_str = sys.argv[1]
    else:
        raw_str = "This is a particularly convoluted test sentence requiring simplification."

    tokens = tokenize(raw_str)
    # Work on a copy so the original tokens stay unchanged.
    tokens_copy = tokens.copy()
    # For each token in the copy:
    for i in range(len(tokens_copy)):
        # Lowercase the current word.
        tokens_copy[i] = tokens_copy[i].lower()
        # Only simplify words that are valid (known to the model and not a
        # stopword) and complex (Zipf frequency below the threshold).
        word_valid = (tokens_copy[i] in wv_model.vocab
                      and tokens_copy[i] not in STOPWORDS)
        word_complex = zfreq(tokens_copy[i], LANG) < MIN_COMPLEXITY
        if word_valid and word_complex:
            result = simplify_token(tokens_copy, i)
            print("Results for '" + tokens_copy[i] + "' - " + str(result) +
                  "\n")