예제 #1
0
def test_write_results(ngramstat):
    """
    Check that writing the expansion standard produces a non-trivial file.

    ``ngramstat`` is not referenced in the body; it is presumably a pytest
    fixture requested only for its setup (TODO confirm).
    """
    output_path = "tests/resources/test.results"
    acronyms = [
        Acronym(acronym=name, left_context='', right_context='')
        for name in ('EKG', 'AP')
    ]
    expansion_standard.write(output_path, {"EKG": {}}, {'EKG', 'AP'}, acronyms)
    # The file must exist and contain more than 10 bytes.
    assert os.path.exists(output_path)
    assert os.path.getsize(output_path) > 10
예제 #2
0
def _generate_ngram_contexts(ngram: str) -> 'List[Acronym]':
    """
    Split an n-gram into every allowed (left context, center, right context) triple.

    The n-gram is tokenized on single spaces. The left context grows one token
    at a time from the start of the n-gram; for each left context, the center
    shrinks from its widest allowed end position down to its narrowest, and
    whatever remains becomes the right context. All three parts are interned.

    :param ngram: space-separated n-gram to decompose.
    :return: list of ``Acronym`` tuples whose ``acronym`` field is the central
        part, ordered by growing left context and then by shrinking center.
    """
    words = ngram.split(" ")
    size = len(words)

    result = []
    # The left context may cover at most half of the n-gram, plus a MAX_DIFF allowance.
    for start in range((size + 1 + MAX_DIFF) // 2):
        # The center may end at most MAX_DIFF tokens off the "balanced" end
        # position (size - start), but never past the end of the n-gram ...
        widest_end = min(size, size - start + MAX_DIFF)
        # ... and it must always keep at least one token (no empty acronym).
        narrowest_end = max(start + 1, size - start - MAX_DIFF)
        for end in range(widest_end, narrowest_end - 1, -1):
            result.append(
                Acronym(acronym=sys.intern(" ".join(words[start:end])),
                        left_context=sys.intern(" ".join(words[:start])),
                        right_context=sys.intern(" ".join(words[end:]))))
    return result
예제 #3
0
def _find_contexts(acronym: str, min_freq: int) -> 'List[Acronym]':
    """
    Collect the non-empty contexts recorded for an acronym in the center map.

    The center map is looked up through the partition computed for the
    acronym. Its ``contexts`` mapping is walked frequency by frequency.

    :param acronym: the acronym to look up.
    :param min_freq: frequency groups below this value contribute no contexts.
    :return: one ``Acronym`` per retained (left, right) context pair.
    """
    partition = functions.partition(acronym, PARTITIONS)
    center_map = resource_factory.get_center_map(partition)

    found = []  # type: List[Acronym]
    for frequency, pairs in center_map.contexts(acronym).items():
        for left_side, right_side in pairs:
            has_context = left_side != '' or right_side != ''
            # Skip pairs where both sides are empty.
            if not has_context:
                continue
            # Too rare: abandon the rest of this frequency group. This leaves
            # only the inner loop, so the remaining groups are still visited.
            if frequency < min_freq:
                break
            found.append(Acronym(acronym=acronym,
                                 left_context=left_side,
                                 right_context=right_side))

    return found
예제 #4
0
def _generate_acronym_contexts(
        contextualized_acronym: 'Acronym') -> 'List[Acronym]':
    """
    Derive progressively shorter context windows around an acronym.

    Both contexts are split on whitespace. The right context is shortened one
    token at a time from its widest allowed size down to empty; for each right
    size, every left context whose length is within MAX_DIFF tokens of the
    "balanced" length is emitted, longest first.

    The right context is considered more informative than the left one
    (e.g. EF 00%, HF 000/min), so it may exceed the left context by up to
    MAX_DIFF tokens at the widest step, and longer right n-grams come first.

    @todo default parameter min_length = 0, so that we avoid empty contexts if we want.

    :param contextualized_acronym: acronym together with its full left and right context.
    :return: list of ``Acronym`` tuples sharing the same acronym, ending with
        the narrowest contexts.
    """
    left_tokens = contextualized_acronym.left_context.split()
    right_tokens = contextualized_acronym.right_context.split()
    n_left = len(left_tokens)
    n_right = len(right_tokens)

    # Widest right context: the shorter side, plus up to MAX_DIFF extra tokens
    # when the right side is the longer one. This never exceeds n_right.
    if n_right > n_left:
        widest_right = n_left + min(MAX_DIFF, n_right - n_left)
    else:
        widest_right = n_right

    result = []  # type: List[Acronym]
    for right_size in range(widest_right, -1, -1):
        right_context = " ".join(right_tokens[:right_size])
        # Left start offsets within MAX_DIFF of the balanced one, clipped to
        # the valid range [0, n_left]; n_left yields the empty left context.
        first_start = max(0, n_left - right_size - MAX_DIFF)
        last_start = min(n_left, n_left - right_size + MAX_DIFF)
        for left_start in range(first_start, last_start + 1):
            result.append(
                Acronym(acronym=contextualized_acronym.acronym,
                        left_context=" ".join(left_tokens[left_start:]),
                        right_context=right_context))
    return result
예제 #5
0
def fastngram(acronym: str,
              left_context: str = "",
              right_context: str = "",
              min_freq: int = 2,
              max_rank: int = 100000) -> Iterator[str]:
    """
    Lazily yield expansion candidates for an acronym given its left and right context.

    The acronym and its contexts are expanded into a list of context windows,
    which is handed to ``_center_provider`` together with ``min_freq`` and
    ``max_rank``; whatever that provider produces is yielded unchanged.
    Note that no filtering is done here, apart from the initial partitioning
    of the acronym.

    :param acronym: the acronym to expand.
    :param left_context: text to the left of the acronym.
    :param right_context: text to the right of the acronym.
    :param min_freq: passed through to ``_center_provider``.
    :param max_rank: passed through to ``_center_provider``.
    :return: iterator over candidate n-grams.
    """
    contexts = _generate_acronym_contexts(
        Acronym(acronym=acronym,
                left_context=left_context,
                right_context=right_context))

    yield from _center_provider(contexts, min_freq, max_rank)
예제 #6
0
def test_update():
    """Updating with an acronym absent from the standard should add it as ``True``."""
    new_entry = Acronym(acronym='AP', left_context='', right_context='')
    expected = {'EKG': False, 'AP': True}
    assert detection_standard.update({'EKG': False}, [new_entry]) == expected
예제 #7
0
def test_filter_acronym_contexts():
    """Only the sentence containing 'EKG' should yield a contextualized acronym."""
    tokenized = [['Hello', 'my', 'world'], ['performed', 'EKG', 'yesterday']]
    expected = [
        Acronym(acronym='EKG', left_context='performed', right_context='yesterday')
    ]
    found = ngrams.filter_acronym_contexts(tokenized)
    # Cap consumption at 100 items.
    assert list(islice(found, 100)) == expected
예제 #8
0
def test__generate_acronym_contexts():
    """
    Check the context windows generated around an acronym with MAX_DIFF = 1.

    Note that this sets ``fastngram.MAX_DIFF`` at module level and does not restore it.
    """
    fastngram.MAX_DIFF = 1

    def ctx(left, center, right):
        # Shorthand for building an Acronym tuple.
        return Acronym(left_context=left, acronym=center, right_context=right)

    # Pairs of (input acronym, expected context windows in order).
    cases = [
        # Baseline
        (ctx('a b c', 'd', 'e f g'), [
            ctx('a b c', 'd', 'e f g'),
            ctx('b c', 'd', 'e f g'),
            ctx('a b c', 'd', 'e f'),
            ctx('b c', 'd', 'e f'),
            ctx('c', 'd', 'e f'),
            ctx('b c', 'd', 'e'),
            ctx('c', 'd', 'e'),
            ctx('', 'd', 'e'),
            ctx('c', 'd', ''),
            ctx('', 'd', ''),
        ]),
        # Empty context
        (ctx('', 'a', ''), [
            ctx('', 'a', ''),
        ]),
        # Longer left context
        (ctx('a b c', 'd', 'e'), [
            ctx('b c', 'd', 'e'),
            ctx('c', 'd', 'e'),
            ctx('', 'd', 'e'),
            ctx('c', 'd', ''),
            ctx('', 'd', ''),
        ]),
        # Longer right context
        (ctx('a', 'b', 'c d e'), [
            ctx('a', 'b', 'c d'),
            ctx('a', 'b', 'c'),
            ctx('', 'b', 'c'),
            ctx('a', 'b', ''),
            ctx('', 'b', ''),
        ]),
    ]

    for given, expected in cases:
        assert fastngram._generate_acronym_contexts(given) == expected
예제 #9
0
def test__generate_ngram_contexts():
    """
    Check the (left, center, right) splits generated for n-grams of 1 to 5 tokens
    with MAX_DIFF = 1.

    Note that this sets ``fastngram.MAX_DIFF`` at module level and does not restore it.
    """
    fastngram.MAX_DIFF = 1

    def ctx(left, center, right):
        # Shorthand for building an Acronym tuple.
        return Acronym(left_context=left, acronym=center, right_context=right)

    # Pairs of (input n-gram, expected splits in order).
    cases = [
        ("a", [
            ctx('', 'a', ''),
        ]),
        ("a b", [
            ctx('', 'a b', ''),
            ctx('', 'a', 'b'),
            ctx('a', 'b', ''),
        ]),
        ("a b c", [
            ctx('', 'a b c', ''),
            ctx('', 'a b', 'c'),
            ctx('a', 'b c', ''),
            ctx('a', 'b', 'c'),
        ]),
        ("a b c d", [
            ctx('', 'a b c d', ''),
            ctx('', 'a b c', 'd'),
            ctx('a', 'b c d', ''),
            ctx('a', 'b c', 'd'),
            ctx('a', 'b', 'c d'),
            ctx('a b', 'c', 'd'),
        ]),
        ("a b c d e", [
            ctx('', 'a b c d e', ''),
            ctx('', 'a b c d', 'e'),
            ctx('a', 'b c d e', ''),
            ctx('a', 'b c d', 'e'),
            ctx('a', 'b c', 'd e'),
            ctx('a b', 'c d', 'e'),
            ctx('a b', 'c', 'd e'),
        ]),
    ]

    for ngram, expected in cases:
        assert fastngram._generate_ngram_contexts(ngram) == expected