Example #1
0
def test_compute_lattice_expected_counts_incomplete_arc():
    # Each arc below is missing exactly one required field (dst, src,
    # token, weight); compute_lattice_expected_counts must reject every
    # such lattice with a ValueError.
    incomplete_arcs = [
        Arc(dst=1, token=Token(tokenIndex=0), weight=-1.),
        Arc(src=0, token=Token(tokenIndex=0), weight=-1.),
        Arc(src=0, dst=1, weight=-1.),
        Arc(src=0, dst=1, token=Token(tokenIndex=0)),
    ]
    for arc in incomplete_arcs:
        lattice = TokenLattice(arcList=[arc], startState=0, endState=1)
        with raises(ValueError):
            compute_lattice_expected_counts(lattice)
Example #2
0
def test_compute_lattice_expected_counts_two_serial_arcs():
    # Single chain 0 --0-> 1 --1-> 2: both tokens lie on the only path,
    # so each expected log-count is 0.
    arcs = [
        Arc(src=0, dst=1, token=Token(tokenIndex=0), weight=-1.),
        Arc(src=1, dst=2, token=Token(tokenIndex=1), weight=-3.),
    ]
    lattice = TokenLattice(arcList=arcs, startState=0, endState=2)
    expected = [0., 0.]
    actual = compute_lattice_expected_counts(lattice)
    assert allclose(expected, actual), '%s !~= %s' % (expected, actual)
Example #3
0
def test_lattice_with_token_list_kind():
    # The tokenization kind is TOKEN_LIST, so get_tokens must return the
    # token list ('mambo no. 4') and ignore the lattice's cached best
    # path, which deliberately carries different text.
    comm = create_comm('comm-1', 'mambo no. 4')
    tokenization = comm.sectionList[0].sentenceList[0].tokenization
    decoy_path = LatticePath()
    decoy_path.tokenList = [
        Token(tokenIndex=0, text=text) for text in ('mambo', 'no.', '3')
    ]
    lattice = TokenLattice()
    lattice.cachedBestPath = decoy_path
    tokenization.lattice = lattice
    assert ['mambo', 'no.', '4'] == [
        t.text for t in get_tokens(tokenization)
    ]
Example #4
0
def test_compute_lattice_expected_counts_triangle_arbitrary_states():
    # Same triangle as the 0/1/2 test but with arbitrary state ids, to
    # show the state numbering carries no meaning:
    #   47 --0-> 9 --1-> 3   (total weight -1 + -2)
    #   47 --1-> 3           (weight -4)
    # Token 1 appears on both paths; token 0 only on the first.
    log_partition = log(exp(-1 + -2) + exp(-4))
    expected = [-1 + -2 - log_partition, 0.]
    lattice = TokenLattice(
        arcList=[
            Arc(src=47, dst=9, token=Token(tokenIndex=0), weight=-1.),
            Arc(src=9, dst=3, token=Token(tokenIndex=1), weight=-2.),
            Arc(src=47, dst=3, token=Token(tokenIndex=1), weight=-4.),
        ],
        startState=47,
        endState=3)
    actual = compute_lattice_expected_counts(lattice)
    assert allclose(expected, actual), '%s !~= %s' % (expected, actual)
Example #5
0
def test_compute_lattice_expected_counts_triangle():
    # Two paths from 0 to 2:
    #   0 --0-> 1 --1-> 2   (total weight -1 + -2)
    #   0 --1-> 2           (weight -4)
    # Token 1 lies on both paths (log-count 0); token 0 only on the
    # first, so its expected log-count is the first path's posterior.
    log_partition = log(exp(-1 + -2) + exp(-4))
    expected = [-1 + -2 - log_partition, 0.]
    lattice = TokenLattice(
        arcList=[
            Arc(src=0, dst=1, token=Token(tokenIndex=0), weight=-1.),
            Arc(src=1, dst=2, token=Token(tokenIndex=1), weight=-2.),
            Arc(src=0, dst=2, token=Token(tokenIndex=1), weight=-4.),
        ],
        startState=0,
        endState=2)
    actual = compute_lattice_expected_counts(lattice)
    assert allclose(expected, actual), '%s !~= %s' % (expected, actual)
def create_sentence(sen_text, sen_start, sen_end, aug, metadata_tool,
                    metadata_timestamp, annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).
    '''

    # Each annotation level implies all the coarser ones.
    has_sections = (annotation_level is not None) and \
        (annotation_level != AL_NONE)
    has_sentences = has_sections and (annotation_level != AL_SECTION)
    has_tokens = has_sentences and (annotation_level != AL_SENTENCE)

    # Draw the sentence UUID before the tokenization UUID so the
    # generator sequence matches the original construction order.
    sentence_uuid = aug.next()

    if has_tokens:
        tokenization = Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(tokenIndex=idx, text=word)
                for idx, word in enumerate(sen_text.split())
            ]),
        )
    else:
        tokenization = None

    return Sentence(
        uuid=sentence_uuid,
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=tokenization,
    )
Example #7
0
def test_compute_lattice_expected_counts_rhombus():
    # Two disjoint paths from 0 to 3:
    #   0 --0-> 1 --1-> 3   (total weight -1 + -2)
    #   0 --0-> 2 --2-> 3   (total weight -3 + -4)
    # Token 0 is on both paths (log-count 0); tokens 1 and 2 each get
    # their own path's posterior.
    log_partition = log(exp(-1 + -2) + exp(-3 + -4))
    expected = [0., -1 + -2 - log_partition, -3 + -4 - log_partition]
    lattice = TokenLattice(
        arcList=[
            Arc(src=0, dst=1, token=Token(tokenIndex=0), weight=-1.),
            Arc(src=1, dst=3, token=Token(tokenIndex=1), weight=-2.),
            Arc(src=0, dst=2, token=Token(tokenIndex=0), weight=-3.),
            Arc(src=2, dst=3, token=Token(tokenIndex=2), weight=-4.),
        ],
        startState=0,
        endState=3)
    actual = compute_lattice_expected_counts(lattice)
    assert allclose(expected, actual), '%s !~= %s' % (expected, actual)
def create_sentence_with_token(sentence_start, sentence_ending, token_start,
                               token_ending):
    # Build a sentence holding a single-token tokenization; both spans
    # use character offsets supplied by the caller.
    token = Token(textSpan=TextSpan(start=token_start, ending=token_ending))
    return Sentence(
        tokenization=Tokenization(tokenList=TokenList(tokenList=[token])),
        textSpan=TextSpan(start=sentence_start, ending=sentence_ending),
        uuid='TEST')
Example #9
0
def test_compute_lattice_expected_counts_one_arc():
    # Degenerate lattice with a single arc 0 --0-> 1: the lone token is
    # on the only path, so its expected log-count is 0.
    lattice = TokenLattice(
        arcList=[Arc(src=0, dst=1, token=Token(tokenIndex=0), weight=-1.)],
        startState=0,
        endState=1)
    expected = [0.]
    actual = compute_lattice_expected_counts(lattice)
    assert allclose(expected, actual), '%s !~= %s' % (expected, actual)
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
       The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(id=comm_id,
                         metadata=AnnotationMetadata(tool=toolname,
                                                     timestamp=timestamp),
                         type=toolname,
                         uuid=aug.next())

    # Build the token list directly from the whitespace-split sentence
    # instead of constructing an empty TokenList and appending in a loop.
    tokenization = Tokenization(kind=TokenizationKind.TOKEN_LIST,
                                metadata=AnnotationMetadata(
                                    tool=toolname, timestamp=timestamp),
                                tokenList=TokenList(tokenList=[
                                    Token(text=token_string, tokenIndex=i)
                                    for i, token_string
                                    in enumerate(sentence_string.split())
                                ]),
                                uuid=aug.next())

    # Sentence and section both span the full text.
    sentence = Sentence(textSpan=TextSpan(0, len(sentence_string)),
                        tokenization=tokenization,
                        uuid=aug.next())

    section = Section(kind="SectionKind",
                      sentenceList=[sentence],
                      textSpan=TextSpan(0, len(sentence_string)),
                      uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
def json_to_concrete(doc: Dict) -> Communication:
    # Build a Communication from a JSON-style dict.  Reads doc['doc_key']
    # (communication id) and doc['language_id'] (sole language, prob 1.0);
    # the token text comes from get_flatten_sentence(doc).
    #
    # NOTE(review): relies on a module-level `augf` UUID generator and on
    # `get_flatten_sentence`, neither of which is defined in this block —
    # confirm they are in scope where this is used.  The order of
    # augf.next() calls (comm, lid, section, sentence, tokenization,
    # entity set, situation set) determines which UUID lands where.
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        # Single "passage" section holding one sentence whose tokens are
        # the flattened document text.
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(
                                    tokenIndex=i,
                                    text=t
                                )
                                for i, t in enumerate(get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        # Empty mention sets so downstream annotators can append to them.
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )

    return comm
Example #12
0
def _comm_with_properties(num_properties):
    # Build a minimal Communication carrying one SituationMention whose
    # single argument holds `num_properties` Property objects, then wire
    # up cross-references before returning it.
    timestamp = 17

    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='tokn-tool', timestamp=timestamp),
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0,
                  text='text',
                  textSpan=TextSpan(start=0, ending=1))
        ]))
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])

    # Both mentions anchor on the tokenization's only token.
    token_ref_seq = TokenRefSequence(tokenizationId=tokenization.uuid,
                                     tokenIndexList=[0],
                                     anchorTokenIndex=0)
    entity_mention = EntityMention(uuid=generate_UUID(),
                                   entityType='entityType',
                                   text='text',
                                   tokens=token_ref_seq)
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='ems-tool', timestamp=timestamp),
        mentionList=[entity_mention])

    property_metadata = AnnotationMetadata(tool='Annotator1',
                                           timestamp=timestamp)
    properties = [
        Property(value="Property%d" % i,
                 metadata=property_metadata,
                 polarity=4.0)
        for i in range(num_properties)
    ]
    argument = MentionArgument(role='role',
                               entityMentionId=entity_mention.uuid,
                               propertyList=properties)
    situation_mention = SituationMention(uuid=generate_UUID(),
                                         tokens=token_ref_seq,
                                         argumentList=[argument])
    situation_mention_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='sms-tool', timestamp=timestamp),
        mentionList=[situation_mention])

    comm = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=AnnotationMetadata(tool='tool', timestamp=timestamp),
        sectionList=[section],
        situationMentionSetList=[situation_mention_set],
        entityMentionSetList=[entity_mention_set])
    add_references_to_communication(comm)
    return comm
Example #13
0
def test_get_conll_tags_one_token_implicit_filter():
    # One token governed by the root (gov=-1); with no explicit filter
    # the single dependency parse is used as-is.
    dependency = Dependency(gov=-1, dep=0, edgeType='edge_0/0')
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[Token(tokenIndex=0, text='t0')]),
        dependencyParseList=[
            DependencyParse(dependencyList=[dependency]),
        ],
    )

    expected = [[(u'0', u'edge_0/0')]]
    assert _get_conll_tags_for_tokenization(tokenization) == expected
Example #14
0
def test_communication_deep_copy():
    # Mutating the source communication must not leak into its deep
    # copies, and the copies must stay equal to each other.
    source = create_comm('a-b-c', text='foo bar baz .')
    copy_a = communication_deep_copy(source)
    copy_b = communication_deep_copy(source)
    assert_simple_comms_equal(source, copy_a)
    assert_simple_comms_equal(copy_a, copy_b)

    source_tokens = (
        source.sectionList[0].sentenceList[0].tokenization.tokenList.tokenList)
    source_tokens[0] = Token(text='bbq', tokenIndex=0)
    copy_tokens = (
        copy_a.sectionList[0].sentenceList[0].tokenization.tokenList.tokenList)

    assert ([t.text for t in source_tokens] !=
            [t.text for t in copy_tokens])
    assert_simple_comms_equal(copy_a, copy_b)
Example #15
0
 def annotate(self, communication):
     """Tokenize every sentence of *communication* in place with NLTK.

     For each sentence, replaces its tokenization with a TOKEN_LIST
     tokenization whose tokens come from nltk.word_tokenize over the
     sentence's text span, then returns the mutated communication.
     """
     # print() call (not the Python 2 print statement) so this file
     # parses under Python 3.
     print(communication.id)
     augf = AnalyticUUIDGeneratorFactory(communication)
     aug = augf.create()
     for section in communication.sectionList:
         for sentence in section.sentenceList:
             text = communication.text[sentence.textSpan.start:sentence.textSpan.ending]
             sentence.tokenization = Tokenization(uuid = aug.next(),
                                                  kind = TokenizationKind.TOKEN_LIST,
                                                  tokenList = TokenList(tokenList=[]),
                                                  tokenTaggingList = [],
                                                  metadata = AnnotationMetadata(timestamp=int(time.time()), tool="nltk"))
             for i, token in enumerate(nltk.word_tokenize(text)):
                 logging.info("Found token %s", token)
                 sentence.tokenization.tokenList.tokenList.append(Token(tokenIndex=i, text=token))
     return communication
Example #16
0
def test_get_conll_tags_one_token():
    # Single-token tokenization; the dependency-parse filter is mocked
    # so we can exercise zero, one-empty, one, and two filtered parses.
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[Token(tokenIndex=0, text='t0')]),
        dependencyParseList=sentinel.dpl,
    )

    def check(filtered_parses, expected_tags):
        # Stub the filter with a fixed return value, assert the produced
        # tags, and verify the filter saw the raw dependencyParseList.
        mock_filter = Mock(return_value=filtered_parses)
        assert _get_conll_tags_for_tokenization(
            tokenization, mock_filter) == expected_tags
        mock_filter.assert_called_with(sentinel.dpl)

    check([], [])

    check([DependencyParse(dependencyList=[])],
          [[(u'', u'')]])

    check([DependencyParse(dependencyList=[
              Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
          ])],
          [[(u'0', u'edge_0/0')]])

    check([DependencyParse(dependencyList=[
              Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
          ]),
          DependencyParse(dependencyList=[
              Dependency(gov=-1, dep=0, edgeType='edge_0/1'),
          ])],
          [[(u'0', u'edge_0/0')],
           [(u'0', u'edge_0/1')]])