def get_group_norm(spacy_span):
    """
    Record one occurrence of a span and return the normalized text of its group.

    The span text is normalized with ``spacy_normalizer`` (any ``args.mark_char``
    in the result is replaced by a space) and the module-level tables are updated:

    * ``np2count``: span text -> number of occurrences seen so far
    * ``np2id``:    span text -> normalized text recorded at its first occurrence
    * ``id2group``: normalized text -> list of span texts sharing it
    * ``id2rep``:   normalized text -> representative span text of the group

    Args:
        spacy_span (spacy.tokens.Span): the span to register; only its ``text``
            and ``lemma_`` attributes are read here.

    Returns:
        The normalized text for the span. When a previously seen span text
        yields a different normalized text than the one recorded for it, the
        value returned by ``merge_groups`` is returned instead.
    """
    surface = spacy_span.text
    group_id = spacy_normalizer(surface, spacy_span.lemma_)
    marker = args.mark_char
    if marker in group_id:
        group_id = group_id.replace(marker, ' ')

    if surface in np2count:
        # Repeated occurrence: the text was registered before, so it already
        # has an entry in np2id.
        np2count[surface] += 1
        recorded_id = np2id[surface]
        if recorded_id != group_id:
            # Same text, different normalized form: hand both forms to
            # merge_groups and use whatever it returns. np2id[surface] is
            # deliberately left as it was.
            return merge_groups(surface, recorded_id, group_id)
        # Same normalized form as before: promote this text to representative
        # once it has been seen more often than the current one.
        if np2count[surface] > np2count[id2rep[group_id]]:
            id2rep[group_id] = surface
        return group_id

    # First occurrence of this text.
    np2count[surface] = 1
    np2id[surface] = group_id
    if group_id not in id2group:
        # Brand-new group: this text founds it and becomes its representative.
        id2group[group_id] = []
        id2rep[group_id] = surface
    id2group[group_id].append(surface)
    return group_id
def get_group_norm(spacy_span):
    """
    Record one occurrence of a span and return the normalized text of its group.

    The occurrence counter of the span text is bumped first, then the text is
    normalized with ``spacy_normalizer`` (any ``args.mark_char`` in the result
    is replaced by a space) and the module-level tables are updated:

    * ``np2count``: span text -> number of occurrences seen so far
    * ``np2id``:    span text -> normalized text (overwritten on every call)
    * ``id2group``: normalized text -> list of span texts sharing it
    * ``id2rep``:   normalized text -> representative span text of the group

    Args:
        spacy_span (spacy.tokens.Span): the span to register; only its ``text``
            and ``lemma_`` attributes are read here.

    Returns:
        The normalized text computed for the span.
    """
    surface = spacy_span.text
    np2count[surface] = np2count.get(surface, 0) + 1

    group_id = spacy_normalizer(surface, spacy_span.lemma_)
    marker = args.mark_char
    if marker in group_id:
        group_id = group_id.replace(marker, ' ')

    np2id[surface] = group_id
    # Make sure a representative exists before any count comparison below.
    id2rep.setdefault(group_id, surface)

    if group_id not in id2group:
        # Brand-new group: this text founds it and becomes its representative.
        id2group[group_id] = [surface]
        id2rep[group_id] = surface
        return group_id

    members = id2group[group_id]
    if surface not in members:
        # New text for an existing group; the representative is not
        # reconsidered on this call.
        members.append(surface)
    elif np2count[surface] > np2count[id2rep[group_id]]:
        # Known member that is now more frequent than the current representative.
        id2rep[group_id] = surface
    return group_id