def preserve_case(token):
    """
    Return True if `token` is a proper noun or acronym, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool

    Raises:
        ValueError: if `token`'s parent document is not POS-tagged
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    # proper nouns always keep their case; otherwise defer to the acronym check
    if token.pos == PROPN:
        return True
    return is_acronym(token.text)
def preserve_case(token):
    """
    Decide whether a token's original casing should be kept.

    True for proper nouns and acronyms, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool

    Raises:
        ValueError: if `token`'s parent document is not POS-tagged
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    is_proper_noun = token.pos == PROPN
    return is_proper_noun or is_acronym(token.text)
def preserve_case(token):
    """
    Return True if `token` is a proper noun or acronym, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool

    Raises:
        ValueError: if `token`'s parent document is not POS-tagged

    TODO: use universal pos PROPN instead of english-specific tags as soon
    as Honnibal decides to include them in his model...
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    # Penn Treebank tags for singular/plural proper nouns
    if token.tag_ in {'NNP', 'NNPS'}:
        return True
    return is_acronym(token.text)
def _get_acronym_definition(acronym, window, threshold=0.8):
    """
    Identify the most likely definition for an acronym given a list of tokens.

    Args:
        acronym (str): acronym for which a definition is sought
        window (``spacy.Span``): a span of tokens from which definition
            extraction will be attempted
        threshold (float, optional): minimum "confidence" in definition
            required for acceptance; valid values in [0.0, 1.0];
            higher value => stricter threshold

    Returns:
        Tuple[str, float]: most likely definition for given acronym
        ('' if none found), along with the confidence assigned to it

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their
        definitions." International Journal on Document Analysis and
        Recognition 1.4 (1999): 191-198.
    """
    def build_lcs_matrix(X, Y):
        # Standard longest-common-subsequence dynamic program; ``b`` marks
        # positions where characters match.
        m = len(X)
        n = len(Y)
        b = zeros((m, n), dtype=int)
        c = zeros((m, n), dtype=int)
        for i in range(m):
            for j in range(n):
                # Out-of-range neighbors must count as 0.  The previous
                # version indexed ``c[i - 1, ...]`` / ``c[..., j - 1]`` at
                # i == 0 / j == 0, which in numpy wraps around to the *last*
                # row/column and could inflate the LCS length.
                up = c[i - 1, j] if i > 0 else 0
                left = c[i, j - 1] if j > 0 else 0
                diag = c[i - 1, j - 1] if (i > 0 and j > 0) else 0
                if X[i] == Y[j]:
                    c[i, j] = diag + 1
                    b[i, j] = 1
                elif up >= left:
                    c[i, j] = up
                else:
                    c[i, j] = left
        return c, b

    def parse_lcs_matrix(b, start_i, start_j, lcs_length, stack, vectors):
        # Enumerate all match vectors realizing an LCS of length
        # ``lcs_length``; a vector maps definition positions to acronym
        # character indexes, with NaN marking unmatched positions.
        m, n = b.shape
        for i in range(start_i, m):
            for j in range(start_j, n):
                if b[i, j] == 1:
                    stack.append((i, j))
                    if lcs_length == 1:
                        vec = [NaN] * n
                        for k, l in stack:
                            vec[l] = k
                        vectors.append(vec)
                    else:
                        parse_lcs_matrix(b, i + 1, j + 1, lcs_length - 1,
                                         stack, vectors)
                    stack = []
        return vectors

    def vector_values(v, types):
        # Compute the comparison features from section 4.3 of the paper.
        vv = {}
        first = v.index(int(nanmin(v)))
        last = v.index(int(nanmax(v)))
        vv['size'] = (last - first) + 1
        vv['distance'] = len(v) - last
        vv['stop_count'] = 0
        vv['misses'] = 0
        for i in range(first, last + 1):
            if v[i] >= 0 and types[i] == 's':
                vv['stop_count'] += 1
            # unmatched positions hold NaN (never None); ``x != x`` is the
            # import-free NaN test -- the previous ``v[i] is None`` check
            # could never fire, so 'misses' was always 0
            elif v[i] != v[i] and types[i] not in ('s', 'h'):
                vv['misses'] += 1
        return vv

    def compare_vectors(A, B, types):
        # Return the better of two match vectors per the paper's heuristics:
        # fewer misses, then fewer stopwords, then closer to the acronym,
        # then a shorter span.
        vv_A = vector_values(A, types)
        vv_B = vector_values(B, types)
        # no one-letter matches, sorryboutit
        if vv_A['size'] == 1:
            return B
        elif vv_B['size'] == 1:
            return A
        if vv_A['misses'] > vv_B['misses']:
            return B
        elif vv_A['misses'] < vv_B['misses']:
            return A
        if vv_A['stop_count'] > vv_B['stop_count']:
            return B
        if vv_A['stop_count'] < vv_B['stop_count']:
            return A
        if vv_A['distance'] > vv_B['distance']:
            return B
        elif vv_A['distance'] < vv_B['distance']:
            return A
        if vv_A['size'] > vv_B['size']:
            return B
        elif vv_A['size'] < vv_B['size']:
            return A
        return A

    # get definition window's leading characters and word types
    def_leads = []
    def_types = []
    for tok in window:
        tok_text = tok.text
        if tok.is_stop:
            def_leads.append(tok_text[0])
            def_types.append('s')
        elif text_utils.is_acronym(tok_text):
            def_leads.append(tok_text[0])
            def_types.append('a')
        elif '-' in tok_text and not tok_text.startswith('-'):
            tok_split = [t[0] for t in tok_text.split('-') if t]
            def_leads.extend(tok_split)
            def_types.extend(
                'H' if i == 0 else 'h' for i in range(len(tok_split)))
        else:
            def_leads.append(tok_text[0])
            def_types.append('w')
    def_leads = ''.join(def_leads).lower()
    def_types = ''.join(def_types)

    # extract alphanumeric characters from acronym
    acr_leads = ''.join(c for c in acronym if c.isalnum())
    # handle special cases of '&' and trailing 's'
    acr_leads = acr_leads.replace('&', 'a')
    if acr_leads.endswith('s'):
        # bail out if it's only a 2-letter acronym to start with, e.g. 'Is'
        if len(acr_leads) == 2:
            return ('', 0.0)
        acr_leads = acr_leads[:-1]
    acr_leads = acr_leads.lower()

    # guard: an empty acronym or window would make the LCS matrix empty
    # and crash on the corner lookup below
    if not acr_leads or not def_leads:
        return ('', 0.0)

    c, b = build_lcs_matrix(acr_leads, def_leads)
    # 4.4.1
    lcs_length = c[c.shape[0] - 1, c.shape[1] - 1]
    confidence = lcs_length / len(acronym)
    if confidence < threshold:
        return ('', confidence)
    vecs = parse_lcs_matrix(b, 0, 0, lcs_length, [], [])
    # first letter of acronym must be present
    vecs = [vec for vec in vecs if 0 in vec]
    if not vecs:
        return ('', confidence)
    best_vec = vecs[0]
    for vec in vecs[1:]:
        best_vec = compare_vectors(best_vec, vec, def_types)

    first = best_vec.index(int(nanmin(best_vec)))
    last = best_vec.index(int(nanmax(best_vec)))
    definition = window[first: last + 1].text
    # a one-word "definition" is not a definition
    if len(definition.split()) == 1:
        return ('', confidence)

    return (definition, confidence)
def acronyms_and_definitions(doc, known_acro_defs=None):
    """
    Extract a collection of acronyms and their most likely definitions,
    if available, from a spacy-parsed doc. If multiple definitions are found
    for a given acronym, only the most frequently occurring definition is
    returned.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)
        known_acro_defs (dict, optional): if certain acronym/definition pairs
            are known, pass them in as {acronym (str): definition (str)};
            algorithm will not attempt to find new definitions

    Returns:
        dict: unique acronyms (keys) with matched definitions (values)

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their
        definitions." International Journal on Document Analysis and
        Recognition 1.4 (1999): 191-198.
    """
    # process function arguments
    acro_defs = defaultdict(list)
    if not known_acro_defs:
        known_acronyms = set()
    else:
        # store known definitions as (definition, confidence) pairs so the
        # voting step below treats them uniformly; the previous version kept
        # bare strings, so ``defs[0][0]`` at the end returned only the first
        # *character* of a known definition, and list-valued entries were
        # dropped from ``known_acronyms`` entirely
        for acro, defs in known_acro_defs.items():
            if isinstance(defs, list):
                acro_defs[acro] = [(d, 1.0) for d in defs]
            else:
                acro_defs[acro] = [(defs, 1.0)]
        known_acronyms = set(acro_defs.keys())

    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    # iterate over sentences and their tokens
    for sent in sents:
        max_ind = len(sent) - 1
        for i, token in enumerate(sent):
            token_ = token.text
            if token_ in known_acronyms or text_utils.is_acronym(token_) is False:
                continue
            # define definition search window(s)
            window_size = min(2 * len(token_), len(token_) + 5)
            windows = [sent[max(i - window_size, 0): i],
                       sent[min(i + 1, max_ind): min(i + window_size + 1, max_ind)]]
            # if candidate inside (X) or -X-, only look in pre-window
            if 0 < i < max_ind:
                adjacent_tokens = sent[i - 1].text + sent[i + 1].text
                if adjacent_tokens in {'()', '--', '––'}:
                    windows.pop()
            # iterate over possible windows,
            # filtering for valid definition strings
            for window in windows:
                window_ = window.text
                # window text can't be all uppercase
                if window_.isupper():
                    continue
                # window can't contain separating punctuation
                if any(punct in window_ for punct in '!?:;'):
                    continue
                # acronym definition can't contain itself: no ouroboros!
                if token_ in window_:
                    continue
                # window must contain at least one character used in acronym
                if not any(char in window_ for char in token_):
                    continue
                definition, confidence = _get_acronym_definition(
                    token_, window, threshold=0.8)
                if definition:
                    acro_defs[token_].append((definition, confidence))
            if not acro_defs.get(token_):
                acro_defs[token_].append(('', 0.0))

    # vote by confidence score in the case of multiple definitions
    for acro, defs in acro_defs.items():
        if len(defs) == 1:
            acro_defs[acro] = defs[0][0]
        else:
            acro_defs[acro] = sorted(defs, key=itemgetter(1), reverse=True)[0][0]

    return dict(acro_defs)
def _get_acronym_definition(acronym, window, threshold=0.8):
    """
    Identify the most likely definition for an acronym given a list of tokens.

    Args:
        acronym (str): acronym for which a definition is sought
        window (``spacy.Span``): a span of tokens from which definition
            extraction will be attempted
        threshold (float, optional): minimum "confidence" in definition
            required for acceptance; valid values in [0.0, 1.0];
            higher value => stricter threshold

    Returns:
        (str, float): most likely definition for given acronym ('' if none
        found), along with the confidence assigned to it

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their
        definitions." International Journal on Document Analysis and
        Recognition 1.4 (1999): 191-198.
    """
    def build_lcs_matrix(X, Y):
        # Longest-common-subsequence dynamic program; ``b`` flags matches.
        m = len(X)
        n = len(Y)
        b = zeros((m, n), dtype=int)
        c = zeros((m, n), dtype=int)
        for i in range(m):
            for j in range(n):
                # Treat out-of-range neighbors as 0.  Previously the code
                # indexed ``c[i - 1, ...]`` / ``c[..., j - 1]`` at row/col 0,
                # which numpy wraps to the *last* row/column and could
                # inflate the LCS length.
                up = c[i - 1, j] if i > 0 else 0
                left = c[i, j - 1] if j > 0 else 0
                diag = c[i - 1, j - 1] if (i > 0 and j > 0) else 0
                if X[i] == Y[j]:
                    c[i, j] = diag + 1
                    b[i, j] = 1
                elif up >= left:
                    c[i, j] = up
                else:
                    c[i, j] = left
        return c, b

    def parse_lcs_matrix(b, start_i, start_j, lcs_length, stack, vectors):
        # Enumerate all match vectors realizing an LCS of ``lcs_length``;
        # a vector maps definition positions to acronym character indexes,
        # with NaN marking unmatched positions.
        m, n = b.shape
        for i in range(start_i, m):
            for j in range(start_j, n):
                if b[i, j] == 1:
                    stack.append((i, j))
                    if lcs_length == 1:
                        vec = [NaN] * n
                        for k, l in stack:
                            vec[l] = k
                        vectors.append(vec)
                    else:
                        parse_lcs_matrix(b, i + 1, j + 1, lcs_length - 1,
                                         stack, vectors)
                    stack = []
        return vectors

    def vector_values(v, types):
        # Comparison features from section 4.3 of Taghva & Gilbreth.
        vv = {}
        first = v.index(int(nanmin(v)))
        last = v.index(int(nanmax(v)))
        vv['size'] = (last - first) + 1
        vv['distance'] = len(v) - last
        vv['stop_count'] = 0
        vv['misses'] = 0
        for i in range(first, last + 1):
            if v[i] >= 0 and types[i] == 's':
                vv['stop_count'] += 1
            # unmatched positions hold NaN (never None); ``x != x`` is the
            # import-free NaN test -- the previous ``v[i] is None`` check
            # could never fire, so 'misses' was always 0
            elif v[i] != v[i] and types[i] not in ('s', 'h'):
                vv['misses'] += 1
        return vv

    def compare_vectors(A, B, types):
        # Pick the better match vector: fewer misses, then fewer stopwords,
        # then closer to the acronym, then a shorter span.
        vv_A = vector_values(A, types)
        vv_B = vector_values(B, types)
        # no one-letter matches, sorryboutit
        if vv_A['size'] == 1:
            return B
        elif vv_B['size'] == 1:
            return A
        if vv_A['misses'] > vv_B['misses']:
            return B
        elif vv_A['misses'] < vv_B['misses']:
            return A
        if vv_A['stop_count'] > vv_B['stop_count']:
            return B
        if vv_A['stop_count'] < vv_B['stop_count']:
            return A
        if vv_A['distance'] > vv_B['distance']:
            return B
        elif vv_A['distance'] < vv_B['distance']:
            return A
        if vv_A['size'] > vv_B['size']:
            return B
        elif vv_A['size'] < vv_B['size']:
            return A
        return A

    # get definition window's leading characters and word types
    def_leads = []
    def_types = []
    for tok in window:
        tok_text = tok.text
        if tok.is_stop:
            def_leads.append(tok_text[0])
            def_types.append('s')
        elif text_utils.is_acronym(tok_text):
            def_leads.append(tok_text[0])
            def_types.append('a')
        elif '-' in tok_text and not tok_text.startswith('-'):
            tok_split = [t[0] for t in tok_text.split('-') if t]
            def_leads.extend(tok_split)
            def_types.extend(
                'H' if i == 0 else 'h' for i in range(len(tok_split)))
        else:
            def_leads.append(tok_text[0])
            def_types.append('w')
    def_leads = ''.join(def_leads).lower()
    def_types = ''.join(def_types)

    # extract alphanumeric characters from acronym
    acr_leads = ''.join(c for c in acronym if c.isalnum())
    # handle special cases of '&' and trailing 's'
    acr_leads = acr_leads.replace('&', 'a')
    if acr_leads.endswith('s'):
        # bail out if it's only a 2-letter acronym to start with, e.g. 'Is'
        if len(acr_leads) == 2:
            return ('', 0.0)
        acr_leads = acr_leads[:-1]
    acr_leads = acr_leads.lower()

    # guard: an empty acronym or window would make the LCS matrix empty
    # and crash on the corner lookup below
    if not acr_leads or not def_leads:
        return ('', 0.0)

    c, b = build_lcs_matrix(acr_leads, def_leads)
    # 4.4.1
    lcs_length = c[c.shape[0] - 1, c.shape[1] - 1]
    confidence = lcs_length / len(acronym)
    if confidence < threshold:
        return ('', confidence)
    vecs = parse_lcs_matrix(b, 0, 0, lcs_length, [], [])
    # first letter of acronym must be present
    vecs = [vec for vec in vecs if 0 in vec]
    if not vecs:
        return ('', confidence)
    best_vec = vecs[0]
    for vec in vecs[1:]:
        best_vec = compare_vectors(best_vec, vec, def_types)

    first = best_vec.index(int(nanmin(best_vec)))
    last = best_vec.index(int(nanmax(best_vec)))
    definition = window[first: last + 1].text
    # a one-word "definition" is not a definition
    if len(definition.split()) == 1:
        return ('', confidence)

    return (definition, confidence)
def acronyms_and_definitions(doc, known_acro_defs=None):
    """
    Extract a collection of acronyms and their most likely definitions,
    if available, from a spacy-parsed doc. If multiple definitions are found
    for a given acronym, only the most frequently occurring definition is
    returned.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        known_acro_defs (dict, optional): if certain acronym/definition pairs
            are known, pass them in as {acronym (str): definition (str)};
            algorithm will not attempt to find new definitions

    Returns:
        dict: unique acronyms (keys) with matched definitions (values)

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their
        definitions." International Journal on Document Analysis and
        Recognition 1.4 (1999): 191-198.
    """
    # process function arguments
    acro_defs = defaultdict(list)
    if not known_acro_defs:
        known_acronyms = set()
    else:
        # normalize known definitions into (definition, confidence) pairs so
        # the voting step below treats them uniformly; storing bare strings
        # (as before) made the final ``defs[0][0]`` yield only the first
        # *character* of a known definition, and list-valued entries were
        # dropped from ``known_acronyms`` entirely
        for acro, defs in known_acro_defs.items():
            if isinstance(defs, list):
                acro_defs[acro] = [(d, 1.0) for d in defs]
            else:
                acro_defs[acro] = [(defs, 1.0)]
        known_acronyms = set(acro_defs.keys())

    try:
        sents = doc.sents
    except AttributeError:  # spacy.Span has no ``.sents``
        sents = [doc]

    # iterate over sentences and their tokens
    for sent in sents:
        max_ind = len(sent) - 1
        for i, token in enumerate(sent):
            token_ = token.text
            if token_ in known_acronyms or text_utils.is_acronym(token_) is False:
                continue
            # define definition search window(s)
            window_size = min(2 * len(token_), len(token_) + 5)
            windows = [sent[max(i - window_size, 0): i],
                       sent[min(i + 1, max_ind): min(i + window_size + 1, max_ind)]]
            # if candidate inside (X) or -X-, only look in pre-window
            if 0 < i < max_ind:
                adjacent_tokens = sent[i - 1].text + sent[i + 1].text
                if adjacent_tokens in {'()', '--', '––'}:
                    _ = windows.pop()
            # iterate over possible windows,
            # filtering for valid definition strings
            for window in windows:
                window_ = window.text
                # window text can't be all uppercase
                if window_.isupper():
                    continue
                # window can't contain separating punctuation
                if any(punct in window_ for punct in '!?:;'):
                    continue
                # acronym definition can't contain itself: no ouroboros!
                if token_ in window_:
                    continue
                # window must contain at least one character used in acronym
                if not any(char in window_ for char in token_):
                    continue
                definition, confidence = _get_acronym_definition(
                    token_, window, threshold=0.8)
                if definition:
                    acro_defs[token_].append((definition, confidence))
            if not acro_defs.get(token_):
                acro_defs[token_].append(('', 0.0))

    # vote by confidence score in the case of multiple definitions
    for acro, defs in acro_defs.items():
        if len(defs) == 1:
            acro_defs[acro] = defs[0][0]
        else:
            acro_defs[acro] = sorted(defs, key=itemgetter(1), reverse=True)[0][0]

    return dict(acro_defs)
def test_is_acronym_exclude(self):
    """Acronyms listed in ``exclude`` must be rejected."""
    result = text_utils.is_acronym('NASA', exclude={'NASA'})
    self.assertFalse(result)
def test_is_acronym_bad(self):
    """No item in ``BAD_ACRONYMS`` may be classified as an acronym."""
    for bad in BAD_ACRONYMS:
        self.assertFalse(text_utils.is_acronym(bad))
def test_is_acronym_good(self):
    """Every item in ``GOOD_ACRONYMS`` must be classified as an acronym."""
    for good in GOOD_ACRONYMS:
        self.assertTrue(text_utils.is_acronym(good))
def test_is_acronym_bad(token):
    """A parametrized non-acronym token must not pass the acronym check."""
    result = text_utils.is_acronym(token)
    assert not result
def test_is_acronym_exclude():
    """An acronym present in ``exclude`` is not reported as one."""
    result = text_utils.is_acronym("NASA", exclude={"NASA"})
    assert not result
def test_is_acronym_bad():
    """None of the known-bad examples should pass the acronym check."""
    for bad in BAD_ACRONYMS:
        assert not text_utils.is_acronym(bad)
def test_is_acronym_good():
    """All known-good examples should pass the acronym check."""
    for good in GOOD_ACRONYMS:
        assert text_utils.is_acronym(good)
def test_is_acronym_exclude():
    """A token explicitly excluded must not be classified as an acronym."""
    result = text_utils.is_acronym('NASA', exclude={'NASA'})
    assert not result
def test_is_acronym_exclude(self):
    """Tokens in the ``exclude`` set are never reported as acronyms."""
    is_acro = text_utils.is_acronym('NASA', exclude={'NASA'})
    self.assertFalse(is_acro)
def test_is_acronym_bad(self):
    """Each known-bad example must fail the acronym check."""
    for candidate in BAD_ACRONYMS:
        self.assertFalse(text_utils.is_acronym(candidate))
def test_is_acronym_good(self):
    """Each known-good example must pass the acronym check."""
    for candidate in GOOD_ACRONYMS:
        self.assertTrue(text_utils.is_acronym(candidate))
def test_is_acronym_good(token):
    """A parametrized known-good token is classified as an acronym."""
    result = text_utils.is_acronym(token)
    assert result
def test_is_acronym_exclude(token, exclude, expected):
    """``exclude`` handling matches the parametrized expectation."""
    result = text_utils.is_acronym(token, exclude=exclude)
    assert result == expected