def test_write_results(ngramstat):
    """
    Write an expansion standard to disk and check that a non-trivial file exists.

    :param ngramstat: requested but not used in the body (presumably a pytest fixture)
    """
    target_path = "tests/resources/test.results"
    ekg_topic = Acronym(acronym='EKG', left_context='', right_context='')
    ap_topic = Acronym(acronym='AP', left_context='', right_context='')

    expansion_standard.write(
        target_path, {"EKG": {}}, {'EKG', 'AP'}, [ekg_topic, ap_topic])

    # The file has to be there and hold more than a handful of bytes.
    assert os.path.exists(target_path)
    assert os.path.getsize(target_path) > 10
def _generate_ngram_contexts(ngram: str) -> 'List[Acronym]':
    """
    Split an n-gram into (left context, center, right context) triples.

    The n-gram is cut on single spaces. The start of the center moves rightwards
    from the first token, and for every start the end of the center moves leftwards,
    so the center shrinks while the lateral context grows.

    :param ngram: space-separated tokens
    :return: one `Acronym` per (left, center, right) split, in generation order
    """
    words = ngram.split(" ")
    total = len(words)
    splits = []

    # Walk only until half and `MAX_DIFF` more.
    last_start = int((total + 1 + MAX_DIFF) / 2)
    for start in range(last_start):
        # Allow up to `MAX_DIFF` difference in size between both sides.
        highest_end = total - start + MAX_DIFF
        lowest_end = total - start - MAX_DIFF
        for end in range(highest_end, lowest_end - 1, -1):
            # Ends only get smaller from here, so the center would stay empty.
            if end <= start:
                break
            # Ends past the last token are skipped.
            if end <= total:
                splits.append(
                    Acronym(acronym=sys.intern(" ".join(words[start:end])),
                            left_context=sys.intern(" ".join(words[:start])),
                            right_context=sys.intern(" ".join(words[end:]))))
    return splits
def _find_contexts(acronym: str, min_freq: int) -> 'List[Acronym]':
    """
    Find contexts in the training data where this acronym appears.

    The contexts are read from the center map of the acronym's partition, grouped
    under a key that is compared against `min_freq`. Groups whose key is below
    `min_freq` are dropped, and so is any context that is empty on both sides.

    :param acronym: the acronym to look up
    :param min_freq: smallest group key (presumably a frequency) that is still accepted
    :return: one `Acronym` per accepted (left, right) context pair
    """
    model = resource_factory.get_center_map(
        functions.partition(acronym, PARTITIONS))

    all_contexts = []  # type: List[Acronym]
    for out_freq, contexts in model.contexts(acronym).items():
        # The key is shared by the whole group, so decide once per group
        # rather than once per context.
        if out_freq < min_freq:
            continue
        all_contexts.extend(
            Acronym(acronym=acronym, left_context=left, right_context=right)
            for left, right in contexts
            # Do not allow empty contexts.
            if not (left == '' and right == '')
        )
    return all_contexts
def _generate_acronym_contexts(
        contextualized_acronym: 'Acronym') -> 'List[Acronym]':
    """
    List variants of a contextualized acronym with progressively shorter context.

    Right context is deemed more important than left context, e.g. EF 00%, HF 000/min,
    so longer right n-grams come first, e.g. (left_bigram, right_trigram). The last
    variant has empty context on both sides.

    @todo default parameter min_length = 0, so that we avoid empty contexts if we want.

    :param contextualized_acronym: acronym with whitespace-separated left/right context
    :return: the variants, from the longest right context down to no context at all
    """
    left_tokens = contextualized_acronym.left_context.split()
    right_tokens = contextualized_acronym.right_context.split()
    n_left = len(left_tokens)
    n_right = len(right_tokens)

    # The right side may exceed the left one by up to MAX_DIFF tokens,
    # but only when the right context really is the longer one.
    longest_right = min(n_left, n_right)
    surplus = n_right - n_left
    if surplus > 0:
        longest_right += min(MAX_DIFF, surplus)

    variants = []  # type: List[Acronym]
    for right_size in range(longest_right, -1, -1):
        # Never ask for more right tokens than exist.
        if right_size > n_right:
            continue
        right_context = " ".join(right_tokens[:right_size])
        # Left start index at which both sides would have the same size.
        balanced_start = n_left - right_size
        for left_start in range(balanced_start - MAX_DIFF,
                                balanced_start + MAX_DIFF + 1):
            # Prevents double empty context on last iteration.
            if left_start > n_left:
                break
            # Negative starts would need more left tokens than exist.
            if left_start >= 0:
                variants.append(
                    Acronym(acronym=contextualized_acronym.acronym,
                            left_context=" ".join(left_tokens[left_start:]),
                            right_context=right_context))
    return variants
def fastngram(acronym: str, left_context: str = "", right_context: str = "",
              min_freq: int = 2, max_rank: int = 100000) -> Iterator[str]:
    """
    Find an unlimited set of expansion candidates for an acronym given its left and right context.

    Note that no filtering is done here, except from the acronym initial partitioning.

    This is a generator: nothing is computed until the result is iterated.

    :param acronym: the acronym to expand
    :param left_context: text to the left of the acronym
    :param right_context: text to the right of the acronym
    :param min_freq: forwarded unchanged to `_center_provider`
    :param max_rank: forwarded unchanged to `_center_provider`
    :return: the candidates produced by `_center_provider`, in its order
    """
    contextualized_acronym = Acronym(acronym=acronym,
                                     left_context=left_context,
                                     right_context=right_context)
    contexts = _generate_acronym_contexts(contextualized_acronym)
    # Delegate directly instead of re-yielding item by item.
    yield from _center_provider(contexts, min_freq, max_rank)
def test_update():
    """Updating a detection standard adds the new acronym as True and keeps old entries."""
    existing_standard = {'EKG': False}
    new_acronym = Acronym(acronym='AP', left_context='', right_context='')

    updated = detection_standard.update(existing_standard, [new_acronym])

    assert updated == {'EKG': False, 'AP': True}
def test_filter_acronym_contexts():
    """Only the sentence containing 'EKG' yields a contextualized acronym."""
    tokenized_sentences = [
        ['Hello', 'my', 'world'],
        ['performed', 'EKG', 'yesterday'],
    ]
    expected = [
        Acronym(acronym='EKG', left_context='performed',
                right_context='yesterday'),
    ]

    # Cap the number of results taken from the iterable at 100.
    found = list(
        islice(ngrams.filter_acronym_contexts(tokenized_sentences), 100))

    assert found == expected
def test__generate_acronym_contexts():
    """Check the generated context variants for several context shapes with MAX_DIFF = 1."""
    fastngram.MAX_DIFF = 1

    def ctx(left, center, right):
        # Shorthand for building one contextualized acronym.
        return Acronym(left_context=left, acronym=center, right_context=right)

    # Each case pairs an input with the exact list of variants expected for it.
    cases = [
        # Baseline
        (ctx('a b c', 'd', 'e f g'), [
            ctx('a b c', 'd', 'e f g'),
            ctx('b c', 'd', 'e f g'),
            ctx('a b c', 'd', 'e f'),
            ctx('b c', 'd', 'e f'),
            ctx('c', 'd', 'e f'),
            ctx('b c', 'd', 'e'),
            ctx('c', 'd', 'e'),
            ctx('', 'd', 'e'),
            ctx('c', 'd', ''),
            ctx('', 'd', ''),
        ]),
        # Empty context
        (ctx('', 'a', ''), [
            ctx('', 'a', ''),
        ]),
        # Longer left context
        (ctx('a b c', 'd', 'e'), [
            ctx('b c', 'd', 'e'),
            ctx('c', 'd', 'e'),
            ctx('', 'd', 'e'),
            ctx('c', 'd', ''),
            ctx('', 'd', ''),
        ]),
        # Longer right context
        (ctx('a', 'b', 'c d e'), [
            ctx('a', 'b', 'c d'),
            ctx('a', 'b', 'c'),
            ctx('', 'b', 'c'),
            ctx('a', 'b', ''),
            ctx('', 'b', ''),
        ]),
    ]

    for given, expected in cases:
        assert fastngram._generate_acronym_contexts(given) == expected
def test__generate_ngram_contexts():
    """Check the (left, center, right) splits of n-grams of one to five tokens with MAX_DIFF = 1."""
    fastngram.MAX_DIFF = 1

    def ctx(left, center, right):
        # Shorthand for building one contextualized n-gram.
        return Acronym(left_context=left, acronym=center, right_context=right)

    # Each case pairs an n-gram with the exact list of splits expected for it.
    cases = [
        ("a", [
            ctx('', 'a', ''),
        ]),
        ("a b", [
            ctx('', 'a b', ''),
            ctx('', 'a', 'b'),
            ctx('a', 'b', ''),
        ]),
        ("a b c", [
            ctx('', 'a b c', ''),
            ctx('', 'a b', 'c'),
            ctx('a', 'b c', ''),
            ctx('a', 'b', 'c'),
        ]),
        ("a b c d", [
            ctx('', 'a b c d', ''),
            ctx('', 'a b c', 'd'),
            ctx('a', 'b c d', ''),
            ctx('a', 'b c', 'd'),
            ctx('a', 'b', 'c d'),
            ctx('a b', 'c', 'd'),
        ]),
        ("a b c d e", [
            ctx('', 'a b c d e', ''),
            ctx('', 'a b c d', 'e'),
            ctx('a', 'b c d e', ''),
            ctx('a', 'b c d', 'e'),
            ctx('a', 'b c', 'd e'),
            ctx('a b', 'c d', 'e'),
            ctx('a b', 'c', 'd e'),
        ]),
    ]

    for ngram, expected in cases:
        assert fastngram._generate_ngram_contexts(ngram) == expected