Esempio n. 1
0
def get_group_norm(spacy_span):
    """
    Record one occurrence of a span's text and return the id of its group.

    The group id is the output of ``spacy_normalizer`` for the span's text
    and lemma, with every ``args.mark_char`` replaced by a space. The
    module-level tables ``np2count``, ``np2id``, ``id2group`` and ``id2rep``
    are updated in place.

    Args:
            spacy_span (spacy.tokens.Span): span whose ``text`` and
                ``lemma_`` attributes are read.

    Returns:
            The normalized group id, or whatever ``merge_groups`` returns
            when this text was previously stored under a different id.
    """
    phrase = spacy_span.text
    group_id = spacy_normalizer(phrase, spacy_span.lemma_)
    marker = args.mark_char
    if marker in group_id:
        group_id = group_id.replace(marker, ' ')

    if phrase not in np2count:
        # First time this exact text is seen: register it under its id.
        np2count[phrase] = 1
        np2id[phrase] = group_id
        if group_id not in id2group:
            # Brand-new group; its first member also becomes the representative.
            id2group[group_id] = [phrase]
            id2rep[group_id] = phrase
        else:
            id2group[group_id].append(phrase)
        return group_id

    # Repeated text: bump its counter, then reconcile with the stored id.
    np2count[phrase] += 1
    stored_id = np2id[phrase]
    if stored_id != group_id:
        # Same text produced a different id this time. np2id[phrase] is
        # deliberately left as is; merge_groups (defined elsewhere) decides
        # which id is returned - per the original author's note it is
        # expected to be one already present in id2group/id2rep.
        return merge_groups(phrase, stored_id, group_id)

    # Ids agree: promote this text to representative if it is now
    # strictly more frequent than the current one.
    if np2count[phrase] > np2count[id2rep[group_id]]:
        id2rep[group_id] = phrase
    return group_id
Esempio n. 2
0
def get_group_norm(spacy_span):
    """
    Record one occurrence of a span's text and return the id of its group.

    The group id is the output of ``spacy_normalizer`` for the span's text
    and lemma, with every ``args.mark_char`` replaced by a space. The
    module-level tables ``np2count``, ``np2id``, ``id2group`` and ``id2rep``
    are updated in place.

    Args:
            spacy_span (spacy.tokens.Span): span whose ``text`` and
                ``lemma_`` attributes are read.

    Returns:
            The normalized group id for the span.
    """
    phrase = spacy_span.text
    # Count this occurrence (starts at 1 for text not seen before).
    np2count[phrase] = np2count.get(phrase, 0) + 1

    group_id = spacy_normalizer(phrase, spacy_span.lemma_)
    marker = args.mark_char
    if marker in group_id:
        group_id = group_id.replace(marker, ' ')

    # The latest id always overwrites whatever was stored for this text.
    np2id[phrase] = group_id
    # Ensure the group has a representative before it is compared below.
    id2rep.setdefault(group_id, phrase)

    if group_id not in id2group:
        # New group: this text is its only member and its representative.
        id2group[group_id] = [phrase]
        id2rep[group_id] = phrase
        return group_id

    members = id2group[group_id]
    if phrase not in members:
        members.append(phrase)
    elif np2count[phrase] > np2count[id2rep[group_id]]:
        # Existing member that is now strictly more frequent than the
        # current representative takes over that role.
        id2rep[group_id] = phrase
    return group_id