Example 1
def preserve_case(token):
    """
    Returns True if `token` is a proper noun or acronym, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    return token.pos == PROPN or is_acronym(token.text)
Example 2
def preserve_case(token):
    """
    Returns True if `token` is a proper noun or acronym, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    return token.pos == PROPN or is_acronym(token.text)
Example 3
def preserve_case(token):
    """
    Returns True if `token` is a proper noun or acronym, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool

    TODO: use universal pos PROPN instead of english-specific tags as soon as
    Honnibal decides to include them in his model...
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    return token.tag_ in {'NNP', 'NNPS'} or is_acronym(token.text)
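
A minimal usage sketch for preserve_case, under the assumptions these snippets leave implicit: a loaded spaCy pipeline with a POS tagger, a spaCy version that still exposes Doc.is_tagged (v2.x), and the PROPN / is_acronym names being importable.

# Hypothetical usage sketch; none of these imports appear in the snippets above.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("NASA launched the new mission.")
kept = [tok.text for tok in doc if preserve_case(tok)]
print(kept)  # likely ['NASA']: a proper noun that is also an acronym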
Example 4
def _get_acronym_definition(acronym, window, threshold=0.8):
    """
    Identify most likely definition for an acronym given a list of tokens.

    Args:
        acronym (str): acronym for which definition is sought
        window (``spacy.Span``): a span of tokens from which definition
            extraction will be attempted
        threshold (float, optional): minimum "confidence" in definition required
            for acceptance; valid values in [0.0, 1.0]; higher value => stricter threshold

    Returns:
        Tuple[str, float]: most likely definition for given acronym ('' if none found),
            along with the confidence assigned to it

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions."
            International Journal on Document Analysis and Recognition 1.4 (1999): 191-198.
    """
    def build_lcs_matrix(X, Y):
        m = len(X)
        n = len(Y)
        b = zeros((m, n), dtype=int)
        c = zeros((m, n), dtype=int)
        for i in range(0, m):
            for j in range(0, n):
                if X[i] == Y[j]:
                    # guard j == 0: negative indexing would otherwise read the
                    # already-filled last column and inflate the LCS length
                    c[i, j] = c[i - 1, j - 1] + 1 if j > 0 else 1
                    b[i, j] = 1
                elif c[i - 1, j] >= c[i, j - 1]:
                    c[i, j] = c[i - 1, j]
                else:
                    c[i, j] = c[i, j - 1]
        return c, b

    def parse_lcs_matrix(b, start_i, start_j, lcs_length, stack, vectors):
        m = b.shape[0]
        n = b.shape[1]
        for i in range(start_i, m):
            for j in range(start_j, n):
                if b[i, j] == 1:
                    s = (i, j)
                    stack.append(s)
                    if lcs_length == 1:
                        vec = [NaN] * n
                        for k, l in stack:
                            vec[l] = k
                        vectors.append(vec)
                    else:
                        parse_lcs_matrix(b, i + 1, j + 1, lcs_length - 1, stack, vectors)
                    stack = []
        return vectors

    def vector_values(v, types):
        vv = {}
        first = v.index(int(nanmin(v)))
        last = v.index(int(nanmax(v)))
        vv['size'] = (last - first) + 1
        vv['distance'] = len(v) - last
        vv['stop_count'] = 0
        vv['misses'] = 0
        for i in range(first, last + 1):
            if v[i] >= 0 and types[i] == 's':
                vv['stop_count'] += 1
            # unmatched positions hold NaN (set in parse_lcs_matrix), not None;
            # isnan is assumed imported from numpy alongside nanmin / nanmax
            elif isnan(v[i]) and types[i] not in ['s', 'h']:
                vv['misses'] += 1
        return vv

    def compare_vectors(A, B, types):
        vv_A = vector_values(A, types)
        vv_B = vector_values(B, types)
        # no one-letter matches, sorryboutit
        if vv_A['size'] == 1:
            return B
        elif vv_B['size'] == 1:
            return A
        if vv_A['misses'] > vv_B['misses']:
            return B
        elif vv_A['misses'] < vv_B['misses']:
            return A
        if vv_A['stop_count'] > vv_B['stop_count']:
            return B
        if vv_A['stop_count'] < vv_B['stop_count']:
            return A
        if vv_A['distance'] > vv_B['distance']:
            return B
        elif vv_A['distance'] < vv_B['distance']:
            return A
        if vv_A['size'] > vv_B['size']:
            return B
        elif vv_A['size'] < vv_B['size']:
            return A
        return A

    # get definition window's leading characters and word types
    def_leads = []
    def_types = []
    for tok in window:
        tok_text = tok.text
        if tok.is_stop:
            def_leads.append(tok_text[0])
            def_types.append('s')
        elif text_utils.is_acronym(tok_text):
            def_leads.append(tok_text[0])
            def_types.append('a')
        elif '-' in tok_text and not tok_text.startswith('-'):
            tok_split = [t[0] for t in tok_text.split('-') if t]
            def_leads.extend(tok_split)
            def_types.extend('H' if i == 0 else 'h' for i in range(len(tok_split)))
        else:
            def_leads.append(tok_text[0])
            def_types.append('w')
    def_leads = ''.join(def_leads).lower()
    def_types = ''.join(def_types)

    # extract alphanumeric characters from acronym
    acr_leads = ''.join(c for c in acronym if c.isalnum())
    # handle special cases of '&' and trailing 's'
    acr_leads = acr_leads.replace('&', 'a')
    if acr_leads.endswith('s'):
        # bail out if it's only a 2-letter acronym to start with, e.g. 'Is'
        if len(acr_leads) == 2:
            return ('', 0)
        acr_leads = acr_leads[:-1]
    acr_leads = acr_leads.lower()

    c, b = build_lcs_matrix(acr_leads, def_leads)

    # step 4.4.1 of Taghva & Gilbreth (1999): confidence = LCS length / acronym length
    lcs_length = c[c.shape[0] - 1, c.shape[1] - 1]
    confidence = lcs_length / len(acronym)
    if confidence < threshold:
        return ('', confidence)

    vecs = parse_lcs_matrix(b, 0, 0, lcs_length, [], [])
    # first letter of acronym must be present
    vecs = [vec for vec in vecs if 0 in vec]
    if not vecs:
        return ('', confidence)

    best_vec = vecs[0]
    for vec in vecs[1:]:
        best_vec = compare_vectors(best_vec, vec, def_types)

    first = best_vec.index(int(nanmin(best_vec)))
    last = best_vec.index(int(nanmax(best_vec)))

    definition = window[first: last + 1].text
    if len(definition.split()) == 1:
        return ('', confidence)

    return (definition, confidence)
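
A sketch of how this private helper is driven; it is normally called from acronyms_and_definitions (below), and the spaCy pipeline plus the numpy names used above (zeros, NaN, nanmin, nanmax) are assumptions not shown in the snippet.

# Hypothetical usage sketch: look for the definition of 'HMM' in its pre-window.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The hidden Markov model (HMM) is a statistical model.")
window = doc[0:4]  # "The hidden Markov model", i.e. the tokens before "("
definition, confidence = _get_acronym_definition("HMM", window, threshold=0.8)
print(definition, confidence)  # expected: roughly ('hidden Markov model', 1.0)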
Example 5
def acronyms_and_definitions(doc, known_acro_defs=None):
    """
    Extract a collection of acronyms and their most likely definitions, if available,
    from a spacy-parsed doc. If multiple definitions are found for a given acronym,
    only the most frequently occurring definition is returned.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)
        known_acro_defs (dict, optional): if certain acronym/definition pairs
            are known, pass them in as {acronym (str): definition (str)};
            algorithm will not attempt to find new definitions

    Returns:
        dict: unique acronyms (keys) with matched definitions (values)

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions."
            International Journal on Document Analysis and Recognition 1.4 (1999): 191-198.
    """
    # process function arguments
    acro_defs = defaultdict(list)
    if not known_acro_defs:
        known_acronyms = set()
    else:
        for acro, defs in known_acro_defs.items():
            # normalize known definitions to (definition, confidence) pairs so that
            # list-valued inputs aren't silently dropped and the voting step below
            # can index them uniformly; 1.0 is an assumed confidence for user input
            if isinstance(defs, list):
                acro_defs[acro] = [(d, 1.0) for d in defs]
            else:
                acro_defs[acro] = [(defs, 1.0)]
        known_acronyms = set(acro_defs.keys())

    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    # iterate over sentences and their tokens
    for sent in sents:
        max_ind = len(sent) - 1

        for i, token in enumerate(sent):

            token_ = token.text
            if token_ in known_acronyms or text_utils.is_acronym(token_) is False:
                continue

            # define definition search window(s)
            window_size = min(2 * len(token_), len(token_) + 5)
            windows = [sent[max(i - window_size, 0): i],
                       sent[min(i + 1, max_ind): min(i + window_size + 1, max_ind)]]
            # if candidate inside (X) or -X-, only look in pre-window
            if 0 < i < max_ind:
                adjacent_tokens = sent[i - 1].text + sent[i + 1].text
                if adjacent_tokens in {'()', '--', '––'}:
                    windows.pop()

            # iterate over possible windows
            # filtering for valid definition strings
            for window in windows:
                window_ = window.text
                # window text can't be all uppercase
                if window_.isupper():
                    continue
                # window can't contain separating punctuation
                if '!' in window_ or '?' in window_ or ':' in window_ or ';' in window_:
                    continue
                # acronym definition can't contain itself: no ouroboros!
                if token_ in window_:
                    continue
                # window must contain at least one character used in acronym
                if not any(char in window_ for char in token_):
                    continue
                definition, confidence = _get_acronym_definition(
                    token_, window, threshold=0.8)
                if definition:
                    acro_defs[token_].append((definition, confidence))

            if not acro_defs.get(token_):
                acro_defs[token_].append(('', 0.0))

    # vote by confidence score in the case of multiple definitions
    for acro, defs in acro_defs.items():
        if len(defs) == 1:
            acro_defs[acro] = defs[0][0]
        else:
            acro_defs[acro] = sorted(defs, key=itemgetter(1), reverse=True)[0][0]

    return dict(acro_defs)
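
A sketch of the public entry point, again assuming a spaCy pipeline and the module-level imports the snippet relies on (defaultdict, itemgetter, text_utils).

# Hypothetical usage sketch.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The National Aeronautics and Space Administration (NASA) was founded in 1958.")
print(acronyms_and_definitions(doc))
# expected: roughly {'NASA': 'National Aeronautics and Space Administration'}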
Example 6
def _get_acronym_definition(acronym, window, threshold=0.8):
    """
    Identify most likely definition for an acronym given a list of tokens.

    Args:
        acronym (str): acronym for which definition is sought
        window (``spacy.Span``): a span of tokens from which definition
            extraction will be attempted
        threshold (float, optional): minimum "confidence" in definition required
            for acceptance; valid values in [0.0, 1.0]; higher value => stricter threshold

    Returns:
        (str, float): most likely definition for given acronym ('' if none found),
            along with the confidence assigned to it

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions."
            International Journal on Document Analysis and Recognition 1.4 (1999): 191-198.
    """
    def build_lcs_matrix(X, Y):
        m = len(X)
        n = len(Y)
        b = zeros((m, n), dtype=int)
        c = zeros((m, n), dtype=int)
        for i in range(0, m):
            for j in range(0, n):
                if X[i] == Y[j]:
                    # guard j == 0: negative indexing would otherwise read the
                    # already-filled last column and inflate the LCS length
                    c[i, j] = c[i - 1, j - 1] + 1 if j > 0 else 1
                    b[i, j] = 1
                elif c[i - 1, j] >= c[i, j - 1]:
                    c[i, j] = c[i - 1, j]
                else:
                    c[i, j] = c[i, j - 1]
        return c, b

    def parse_lcs_matrix(b, start_i, start_j, lcs_length, stack, vectors):
        m = b.shape[0]
        n = b.shape[1]
        for i in range(start_i, m):
            for j in range(start_j, n):
                if b[i, j] == 1:
                    s = (i, j)
                    stack.append(s)
                    if lcs_length == 1:
                        vec = [NaN] * n
                        for k, l in stack:
                            vec[l] = k
                        vectors.append(vec)
                    else:
                        parse_lcs_matrix(b, i + 1, j + 1, lcs_length - 1, stack, vectors)
                    stack = []
        return vectors

    def vector_values(v, types):
        vv = {}
        first = v.index(int(nanmin(v)))
        last = v.index(int(nanmax(v)))
        vv['size'] = (last - first) + 1
        vv['distance'] = len(v) - last
        vv['stop_count'] = 0
        vv['misses'] = 0
        for i in range(first, last + 1):
            if v[i] >= 0 and types[i] == 's':
                vv['stop_count'] += 1
            # unmatched positions hold NaN (set in parse_lcs_matrix), not None;
            # isnan is assumed imported from numpy alongside nanmin / nanmax
            elif isnan(v[i]) and types[i] not in ['s', 'h']:
                vv['misses'] += 1
        return vv

    def compare_vectors(A, B, types):
        vv_A = vector_values(A, types)
        vv_B = vector_values(B, types)
        # no one-letter matches, sorryboutit
        if vv_A['size'] == 1:
            return B
        elif vv_B['size'] == 1:
            return A
        if vv_A['misses'] > vv_B['misses']:
            return B
        elif vv_A['misses'] < vv_B['misses']:
            return A
        if vv_A['stop_count'] > vv_B['stop_count']:
            return B
        if vv_A['stop_count'] < vv_B['stop_count']:
            return A
        if vv_A['distance'] > vv_B['distance']:
            return B
        elif vv_A['distance'] < vv_B['distance']:
            return A
        if vv_A['size'] > vv_B['size']:
            return B
        elif vv_A['size'] < vv_B['size']:
            return A
        return A

    # get definition window's leading characters and word types
    def_leads = []
    def_types = []
    for tok in window:
        tok_text = tok.text
        if tok.is_stop:
            def_leads.append(tok_text[0])
            def_types.append('s')
        elif text_utils.is_acronym(tok_text):
            def_leads.append(tok_text[0])
            def_types.append('a')
        elif '-' in tok_text and not tok_text.startswith('-'):
            tok_split = [t[0] for t in tok_text.split('-') if t]
            def_leads.extend(tok_split)
            def_types.extend('H' if i == 0 else 'h' for i in range(len(tok_split)))
        else:
            def_leads.append(tok_text[0])
            def_types.append('w')
    def_leads = ''.join(def_leads).lower()
    def_types = ''.join(def_types)

    # extract alphanumeric characters from acronym
    acr_leads = ''.join(c for c in acronym if c.isalnum())
    # handle special cases of '&' and trailing 's'
    acr_leads = acr_leads.replace('&', 'a')
    if acr_leads.endswith('s'):
        # bail out if it's only a 2-letter acronym to start with, e.g. 'Is'
        if len(acr_leads) == 2:
            return ('', 0)
        acr_leads = acr_leads[:-1]
    acr_leads = acr_leads.lower()

    c, b = build_lcs_matrix(acr_leads, def_leads)

    # step 4.4.1 of Taghva & Gilbreth (1999): confidence = LCS length / acronym length
    lcs_length = c[c.shape[0] - 1, c.shape[1] - 1]
    confidence = lcs_length / len(acronym)
    if confidence < threshold:
        return ('', confidence)

    vecs = parse_lcs_matrix(b, 0, 0, lcs_length, [], [])
    # first letter of acronym must be present
    vecs = [vec for vec in vecs if 0 in vec]
    if not vecs:
        return ('', confidence)

    best_vec = vecs[0]
    for vec in vecs[1:]:
        best_vec = compare_vectors(best_vec, vec, def_types)

    first = best_vec.index(int(nanmin(best_vec)))
    last = best_vec.index(int(nanmax(best_vec)))

    definition = window[first: last + 1].text
    if len(definition.split()) == 1:
        return ('', confidence)

    return (definition, confidence)
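
For intuition about the confidence score computed at step 4.4.1, a tiny sanity check of the LCS helper on plain strings. build_lcs_matrix is nested inside _get_acronym_definition, so this assumes the helper has been lifted to module level and that numpy's zeros is imported.

# Hypothetical sanity check: acronym leads 'nasa' vs. definition leads 'tnaasa'.
# Their longest common subsequence ('nasa') has length 4, so the confidence
# would be 4 / len('NASA') = 1.0.
c, b = build_lcs_matrix('nasa', 'tnaasa')
print(c[-1, -1])  # expected: 4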
Example 7
def acronyms_and_definitions(doc, known_acro_defs=None):
    """
    Extract a collection of acronyms and their most likely definitions, if available,
    from a spacy-parsed doc. If multiple definitions are found for a given acronym,
    only the most frequently occurring definition is returned.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        known_acro_defs (dict, optional): if certain acronym/definition pairs
            are known, pass them in as {acronym (str): definition (str)};
            algorithm will not attempt to find new definitions

    Returns:
        dict: unique acronyms (keys) with matched definitions (values)

    References:
        Taghva, Kazem, and Jeff Gilbreth. "Recognizing acronyms and their definitions."
            International Journal on Document Analysis and Recognition 1.4 (1999): 191-198.
    """
    # process function arguments
    acro_defs = defaultdict(list)
    if not known_acro_defs:
        known_acronyms = set()
    else:
        for acro, defs in known_acro_defs.items():
            # normalize known definitions to (definition, confidence) pairs so that
            # list-valued inputs aren't silently dropped and the voting step below
            # can index them uniformly; 1.0 is an assumed confidence for user input
            if isinstance(defs, list):
                acro_defs[acro] = [(d, 1.0) for d in defs]
            else:
                acro_defs[acro] = [(defs, 1.0)]
        known_acronyms = set(acro_defs.keys())

    try:
        sents = doc.sents
    except AttributeError:
        sents = [doc]
    # iterate over sentences and their tokens
    for sent in sents:
        max_ind = len(sent) - 1

        for i, token in enumerate(sent):

            token_ = token.text
            if token_ in known_acronyms or text_utils.is_acronym(token_) is False:
                continue

            # define definition search window(s)
            window_size = min(2 * len(token_), len(token_) + 5)
            windows = [sent[max(i - window_size, 0): i],
                       sent[min(i + 1, max_ind): min(i + window_size + 1, max_ind)]]
            # if candidate inside (X) or -X-, only look in pre-window
            if 0 < i < max_ind:
                adjacent_tokens = sent[i - 1].text + sent[i + 1].text
                if adjacent_tokens in {'()', '--', '––'}:
                    _ = windows.pop()

            # iterate over possible windows
            # filtering for valid definition strings
            for window in windows:
                window_ = window.text
                # window text can't be all uppercase
                if window_.isupper():
                    continue
                # window can't contain separating punctuation
                if '!' in window_ or '?' in window_ or ':' in window_ or ';' in window_:
                    continue
                # acronym definition can't contain itself: no ouroboros!
                if token_ in window_:
                    continue
                # window must contain at least one character used in acronym
                if not any(char in window_ for char in token_):
                    continue
                definition, confidence = _get_acronym_definition(
                    token_, window, threshold=0.8)
                if definition:
                    acro_defs[token_].append((definition, confidence))

            if not acro_defs.get(token_):
                acro_defs[token_].append(('', 0.0))

    # vote by confidence score in the case of multiple definitions
    for acro, defs in acro_defs.items():
        if len(defs) == 1:
            acro_defs[acro] = defs[0][0]
        else:
            acro_defs[acro] = sorted(defs, key=itemgetter(1), reverse=True)[0][0]

    return dict(acro_defs)
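
A short sketch of the known_acro_defs path, with illustrative values; a supplied pair is returned as-is and the definition search is skipped for that acronym.

# Hypothetical usage sketch.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("NASA and the ESA signed the agreement.")
known = {"NASA": "National Aeronautics and Space Administration"}
print(acronyms_and_definitions(doc, known_acro_defs=known))
# expected: roughly {'NASA': 'National Aeronautics and Space Administration', 'ESA': ''},
# since no definition for 'ESA' appears in the sentence.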
Example 8
 def test_is_acronym_exclude(self):
     self.assertFalse(text_utils.is_acronym('NASA', exclude={'NASA'}))
Example 9
 def test_is_acronym_bad(self):
     for item in BAD_ACRONYMS:
         self.assertFalse(text_utils.is_acronym(item))
Example 10
 def test_is_acronym_good(self):
     for item in GOOD_ACRONYMS:
         self.assertTrue(text_utils.is_acronym(item))
Example 11
def test_is_acronym_bad(token):
    assert not text_utils.is_acronym(token)
Example 12
def test_is_acronym_exclude():
    assert not text_utils.is_acronym("NASA", exclude={"NASA"})
Example 13
def test_is_acronym_bad():
    for item in BAD_ACRONYMS:
        assert not text_utils.is_acronym(item)
Example 14
def test_is_acronym_good():
    for item in GOOD_ACRONYMS:
        assert text_utils.is_acronym(item)
Example 15
def test_is_acronym_exclude():
    assert not text_utils.is_acronym('NASA', exclude={'NASA'})
Example 16
 def test_is_acronym_exclude(self):
     self.assertFalse(text_utils.is_acronym('NASA', exclude={'NASA'}))
Example 17
 def test_is_acronym_bad(self):
     for item in BAD_ACRONYMS:
         self.assertFalse(text_utils.is_acronym(item))
Example 18
 def test_is_acronym_good(self):
     for item in GOOD_ACRONYMS:
         self.assertTrue(text_utils.is_acronym(item))
Example 19
def test_is_acronym_good(token):
    assert text_utils.is_acronym(token)
Example 20
def test_is_acronym_exclude(token, exclude, expected):
    assert text_utils.is_acronym(token, exclude=exclude) == expected
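
Examples 11, 19, and 20 take their inputs as function arguments, which implies pytest parametrization that the snippets do not show. A minimal sketch of how they might be wired up; the parameter values and the import path are assumptions, not the project's actual test data.

import pytest

from textacy import text_utils  # import path assumed from the snippets above

@pytest.mark.parametrize("token", ["NASA", "HTML"])  # illustrative stand-ins for GOOD_ACRONYMS
def test_is_acronym_good(token):
    assert text_utils.is_acronym(token)

@pytest.mark.parametrize(
    "token, exclude, expected",
    [("NASA", {"NASA"}, False)],  # the case shown in Examples 12 and 15
)
def test_is_acronym_exclude(token, exclude, expected):
    assert text_utils.is_acronym(token, exclude=exclude) == expected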