def postprocess_ml(split, token_counters, model):
    """Refine a tokenization with a trained split/merge classifier: first
    merge adjacent tokens whose concatenation is a known word (when the model
    votes against a split), then split tokens whose two halves are both known
    words (when the model votes for a split)."""
    # Merge pass: greedily join each token onto the previous one when the
    # concatenation is a word and the model predicts "no split".
    merged_tokens = []
    for token in split:
        if len(merged_tokens) == 0:
            merged_tokens.append(token)
        else:
            last = merged_tokens[-1]
            merged = last + token
            if is_word(merged, token_counters) and not predict_split(
                    model, last, token, token_counters):
                merged_tokens[-1] = merged
            else:
                merged_tokens.append(token)
    # Split pass: try every split position and break the token at the first
    # one where both halves are known words and the model predicts a split.
    postprocessed = []
    for token in merged_tokens:
        did_split = False
        for split_pos in range(1, len(token)):
            prefix = token[:split_pos]
            suffix = token[split_pos:]
            if is_word(prefix, token_counters) and is_word(suffix, token_counters) \
                    and predict_split(model, prefix, suffix, token_counters):
                postprocessed.append(prefix)
                postprocessed.append(suffix)
                did_split = True
                break
        if not did_split:
            postprocessed.append(token)
    return postprocessed
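# The helpers used throughout this listing (is_word, get_count, predict_split)
# are defined elsewhere in the original module. A minimal sketch of the first
# two, assuming token_counters is a collections.Counter of known words (the
# names and semantics here are assumptions, not the original implementation):
from collections import Counter


def is_word(token, token_counters):
    # A token counts as a word if it occurs in the training vocabulary.
    return token_counters.get(token, 0) > 0


def get_count(token, token_counters):
    return token_counters.get(token, 0)


# Hypothetical usage of postprocess_ml, given some trained `model`:
#   token_counters = Counter({"token": 10, "some": 7})
#   postprocess_ml(["tok", "en"], token_counters, model)  # -> ["token"]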
def combine_mergable_tokens_until_two_correct(tokens, token_counters):
    """Group consecutive mergable tokens (separated by single-space tokens)
    into merge candidates, closing a group as soon as two consecutive known
    words have been seen. `tokens` is a list of (mergable, text) pairs;
    returns a list of (mergable, [texts]) groups."""
    token_lists = []
    t_i = 0
    while t_i < len(tokens):
        current = tokens[t_i]
        if not current[0]:
            token_lists.append((False, [current[1]]))
        else:
            words_to_merge = [current[1]]
            consecutive_correct = 1 if is_word(current[1],
                                               token_counters) else 0
            while t_i < len(tokens) - 2:
                if tokens[t_i + 1][1] == ' ' and tokens[t_i + 2][0]:
                    word = tokens[t_i + 2][1]
                    if is_word(word, token_counters):
                        consecutive_correct += 1
                    else:
                        consecutive_correct = 0
                    if consecutive_correct == 2:
                        break
                    words_to_merge.append(word)
                    t_i += 2
                else:
                    break
            token_lists.append((True, words_to_merge))
        t_i += 1
    return token_lists
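# Hypothetical usage, assuming an upstream tokenizer that emits
# (mergable, text) pairs with single-space separator tokens in between (that
# tokenizer is not part of this listing). With "is" and "fun" in
# token_counters but not "spel" or "ling":
#
#   tokens = [(True, "spel"), (False, " "), (True, "ling"), (False, " "),
#             (True, "is"), (False, " "), (True, "fun")]
#   combine_mergable_tokens_until_two_correct(tokens, token_counters)
#   # -> [(True, ["spel", "ling", "is"]), (False, [" "]), (True, ["fun"])]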
def number_of_nonwords(tokens, word_counters, limit=None):
    """Count tokens that are not known words; with `limit` set, stop counting
    as soon as the limit is exceeded."""
    nonwords = 0
    for token in tokens:
        if not is_word(token, word_counters):
            nonwords += 1
            if limit is not None and nonwords > limit:
                return nonwords
    return nonwords
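# For instance, with word_counters = Counter({"a": 1, "b": 1}):
#   number_of_nonwords(["a", "x", "y", "b"], word_counters)           # -> 2
#   number_of_nonwords(["a", "x", "y", "b"], word_counters, limit=0)  # -> 1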
def get_split_candidates(sequence, token_counters):
    """Recursively enumerate the ways to split `sequence` so that every part
    except possibly the last is a known word. The unsplit sequence itself is
    always a candidate."""
    candidates = [[sequence]]
    # Start at 1 so the prefix is never empty (guards against infinite
    # recursion if the empty string were ever counted as a word).
    for i in range(1, len(sequence)):
        prefix = sequence[:i]
        if is_word(prefix, token_counters):
            follow_up_candidates = get_split_candidates(
                sequence[i:], token_counters)
            for follow_up in follow_up_candidates:
                candidates.append([prefix] + follow_up)
    return candidates
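# Recursion example, with token_counters = Counter({"a": 1, "ab": 1, "b": 1}):
#   get_split_candidates("abc", token_counters)
#   # -> [["abc"], ["a", "bc"], ["a", "b", "c"], ["ab", "c"]]
# Note the unsplit sequence is always included, and only the final part of a
# candidate may be a nonword.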
def get_word_positions(string, word_dict, max_word_len=None):
    """Return (start, end) index pairs for every substring of `string` (at
    most `max_word_len` characters long) that is a known word."""
    positions = []
    if max_word_len is None:
        max_word_len = len(string)
    for i in range(len(string)):
        end = min(len(string), i + max_word_len) + 1
        for j in range(i + 1, end):
            if is_word(string[i:j], word_dict):
                positions.append((i, j))
    return positions
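# For example, with word_dict = Counter({"a": 1, "ab": 1, "b": 1}):
#   get_word_positions("abc", word_dict)  # -> [(0, 1), (0, 2), (1, 2)]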
def negative_examples(token_lists, word_counters):
    """Build label-0 (do not split) training examples: correct tokens whose
    prefix and suffix at some split position are both known words, i.e.
    places where a split would look plausible but is wrong."""
    neg_c_tok = []
    neg_c_pre = []
    neg_c_suf = []
    for token_list in token_lists:
        for token in token_list:
            for split_pos in range(1, len(token)):
                prefix = token[:split_pos]
                suffix = token[split_pos:]
                if is_word(prefix, word_counters) and is_word(suffix, word_counters):
                    token_count = get_count(token, word_counters)
                    prefix_count = get_count(prefix, word_counters)
                    suffix_count = get_count(suffix, word_counters)
                    neg_c_tok.append(token_count)
                    neg_c_pre.append(prefix_count)
                    neg_c_suf.append(suffix_count)
    X = feature_matrix(neg_c_pre, neg_c_suf, neg_c_tok)
    y = [0] * len(neg_c_tok)
    return X, y
def positive_examples(token_lists, word_counters):
    """Build label-1 (split) training examples: adjacent token pairs whose
    concatenation is also a known word, i.e. places where a space is
    genuinely needed even though the merged form looks like a word."""
    pos_c_tok = []
    pos_c_pre = []
    pos_c_suf = []
    for token_list in token_lists:
        for prefix, suffix in zip(token_list[:-1], token_list[1:]):
            merged = prefix + suffix
            if is_word(merged, word_counters):
                merged_count = get_count(merged, word_counters)
                prefix_count = get_count(prefix, word_counters)
                suffix_count = get_count(suffix, word_counters)
                pos_c_tok.append(merged_count)
                pos_c_pre.append(prefix_count)
                pos_c_suf.append(suffix_count)
    X = feature_matrix(pos_c_pre, pos_c_suf, pos_c_tok)
    y = [1] * len(pos_c_tok)
    return X, y
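# `feature_matrix` is not shown in this listing. A minimal sketch, assuming it
# simply stacks the prefix, suffix, and merged-token counts column-wise (the
# real feature set may differ, e.g. log counts or count ratios):
import numpy as np


def feature_matrix(prefix_counts, suffix_counts, token_counts):
    return np.column_stack([prefix_counts, suffix_counts, token_counts])


# A split classifier could then be trained on both example sets, e.g. with
# scikit-learn (an assumption; the original model type is not shown here):
#
#   from sklearn.linear_model import LogisticRegression
#   X_pos, y_pos = positive_examples(pos_lists, word_counters)
#   X_neg, y_neg = negative_examples(neg_lists, word_counters)
#   model = LogisticRegression().fit(np.vstack([X_pos, X_neg]), y_pos + y_neg)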
def get_best_splits_naive(tokens, word_counters):
    """Find the best splits of the concatenated tokens, minimizing first the
    number of nonwords and then the number of edit operations relative to
    the original spacing."""
    orig_nonwords = sum(
        0 if is_word(token, word_counters) else 1 for token in tokens)
    # Character positions of the original spaces in the concatenated string.
    orig_space_positions = cumsum([len(token) for token in tokens[:-1]])
    merged = "".join(tokens)
    word_positions = get_word_positions(merged, word_counters)
    candidates = candidates_from_word_positions(merged, word_positions,
                                                word_counters,
                                                orig_nonwords - 1)
    # original split as candidate:
    if len(tokens) == 1:
        original_split = [(0, len(merged))]
    else:
        original_split = [(0, orig_space_positions[0])]
        for i in range(len(orig_space_positions[:-1])):
            original_split.append(
                (orig_space_positions[i], orig_space_positions[i + 1]))
        original_split.append((orig_space_positions[-1], len(merged)))
    candidates.append(original_split)
    # Deduplicate candidate splits.
    candidates = {tuple(c) for c in candidates}
    best_splits = []
    # len(merged) is a safe upper bound for both criteria.
    best_n_nonwords = len(merged)
    best_n_operations = len(merged)
    for c in candidates:
        c_words = [merged[pos[0]:pos[1]] for pos in c]
        c_n_nonwords = number_of_nonwords(c_words,
                                          word_counters,
                                          limit=best_n_nonwords)
        if c_n_nonwords <= best_n_nonwords:
            c_n_operations = number_of_operations(c, orig_space_positions)
        else:
            c_n_operations = len(merged)
        if c_n_nonwords < best_n_nonwords \
                or (c_n_nonwords == best_n_nonwords and c_n_operations < best_n_operations):
            best_n_nonwords = c_n_nonwords
            best_n_operations = c_n_operations
            best_splits = [c]
        elif c_n_nonwords == best_n_nonwords and c_n_operations == best_n_operations:
            best_splits.append(c)
    best_splits = [list(split) for split in best_splits]
    return best_splits
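# `cumsum`, `number_of_operations`, and `candidates_from_word_positions` are
# defined elsewhere in the original module. Hedged sketches of the first two,
# under the interpretation used above (cumulative sums of token lengths, and
# operations counted as spaces inserted plus spaces removed):
from itertools import accumulate


def cumsum(values):
    return list(accumulate(values))


def number_of_operations(split, orig_space_positions):
    # Interior boundaries of the candidate split (the end of every segment
    # except the last one is a space position).
    split_spaces = {end for _, end in split[:-1]}
    orig_spaces = set(orig_space_positions)
    # Insertions plus deletions = size of the symmetric difference.
    return len(split_spaces ^ orig_spaces)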