def test_repr_on_tokenization():
    tokenization = Tokenization(
        metadata=AnnotationMetadata(
            tool="test",
            timestamp=int(time.time())),
        uuid=UUID(uuidString='01234567-0123-4567-89ab-cdef89abcdef')
    )
    tokenization.__repr__()

def test_repr_on_tokenization(self):
    tokenization = Tokenization(
        metadata=AnnotationMetadata(
            tool="test",
            timestamp=int(time.time())),
        uuid=UUID(uuidString='01234567-0123-4567-89ab-cdef89abcdef')
    )
    tokenization.__repr__()

def create_sentence(sen_text, sen_start, sen_end, aug, metadata_tool,
                    metadata_timestamp, annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).
    '''
    sections = (annotation_level is not None) and (annotation_level != AL_NONE)
    sentences = sections and (annotation_level != AL_SECTION)
    tokens = sentences and (annotation_level != AL_SENTENCE)

    return Sentence(
        uuid=aug.next(),
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(
                    tokenIndex=i,
                    text=tok_text,
                ) for (i, tok_text) in enumerate(sen_text.split())
            ]),
        ) if tokens else None,
    )

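# Hypothetical usage of create_sentence. `aug` is an analytic UUID generator
# as used elsewhere in these snippets, and AL_TOKEN is assumed to be the
# annotation-level constant that enables tokenization (alongside the AL_NONE,
# AL_SECTION, and AL_SENTENCE constants referenced above).
augf = AnalyticUUIDGeneratorFactory()
aug = augf.create()
sentence = create_sentence('Hello world .', 0, 13, aug,
                           'example-tool', int(time.time()), AL_TOKEN)
assert len(sentence.tokenization.tokenList.tokenList) == 3
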
def test_get_conll_tags_zero_tokens_implicit_filter():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[]),
        dependencyParseList=[
            DependencyParse(dependencyList=[]),
        ])
    assert _get_conll_tags_for_tokenization(tokenization) == [[]]

def test_get_conll_tags_no_token_list():
    tokenization = Tokenization()
    assert _get_conll_tags_for_tokenization(tokenization) == []
    mock_filter = Mock(return_value=[])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter) == []

def create_sentence_with_token(sentence_start, sentence_ending,
                               token_start, token_ending):
    token_textspan = TextSpan(start=token_start, ending=token_ending)
    token = Token(textSpan=token_textspan)
    tokenization = Tokenization(tokenList=TokenList(tokenList=[token]))
    sentence_textspan = TextSpan(start=sentence_start, ending=sentence_ending)
    sentence = Sentence(tokenization=tokenization,
                        textSpan=sentence_textspan,
                        uuid='TEST')
    return sentence

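# Example (offsets are illustrative): a sentence covering characters 0-11
# that contains a single token covering characters 0-5.
sentence = create_sentence_with_token(0, 11, 0, 5)
token = sentence.tokenization.tokenList.tokenList[0]
assert (token.textSpan.start, token.textSpan.ending) == (0, 5)
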
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=aug.next())

    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(tokenList=[]),
        uuid=aug.next())

    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(
            Token(text=token_string, tokenIndex=i))

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=aug.next())

    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm

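# Minimal usage sketch: with the default sentence_string, the single
# Sentence is whitespace-tokenized into four tokens.
comm = create_simple_comm('test-comm-1')
tokens = comm.sectionList[0].sentenceList[0].tokenization.tokenList.tokenList
assert [t.text for t in tokens] == ['Super', 'simple', 'sentence', '.']
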
def test_get_conll_tags_zero_tokens():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[]),
        dependencyParseList=sentinel.dpl,
    )

    mock_filter = Mock(return_value=[
        DependencyParse(dependencyList=[]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter) == [[]]
    mock_filter.assert_called_with(sentinel.dpl)

def json_to_concrete(doc: Dict) -> Communication:
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(
                                    tokenIndex=i,
                                    text=t
                                ) for i, t in enumerate(get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )
    return comm

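# Hypothetical usage of json_to_concrete. It assumes the module-level `augf`
# UUID generator and the get_flatten_sentence helper referenced above are
# already defined; 'doc_key' and 'language_id' mirror the keys the function
# reads, and the remaining structure is whatever get_flatten_sentence expects.
doc = {
    'doc_key': 'doc-001',
    'language_id': 'eng',
    # ... sentence content consumed by get_flatten_sentence ...
}
comm = json_to_concrete(doc)
assert comm.id == 'doc-001'
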
def _comm_with_properties(num_properties):
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0, text='text', textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(), metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST, tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(), kind='kind', label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid, tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(), entityType='entityType',
                       text='text', tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(), metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = list(
        Property(value="Property%d" % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties))
    am = MentionArgument(role='role', entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs, argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(), metadata=meta_sms,
                              mentionList=[sm])

    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(), id='id', text='text',
                         type='type', metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm

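# Example: a Communication whose single SituationMention argument carries
# three Property annotations.
comm = _comm_with_properties(3)
mention = comm.situationMentionSetList[0].mentionList[0]
assert len(mention.argumentList[0].propertyList) == 3
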
def test_get_conll_tags_one_token_implicit_filter():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='t0'),
        ]),
        dependencyParseList=[
            DependencyParse(dependencyList=[
                Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
            ]),
        ],
    )
    assert _get_conll_tags_for_tokenization(tokenization) == [
        [(u'0', u'edge_0/0')],
    ]

def annotate(self, communication):
    print(communication.id)
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            text = communication.text[sentence.textSpan.start:sentence.textSpan.ending]
            # Replace any existing tokenization with a fresh NLTK-based one.
            sentence.tokenization = Tokenization(
                uuid=next(aug),
                kind=TokenizationKind.TOKEN_LIST,
                tokenList=TokenList(tokenList=[]),
                tokenTaggingList=[],
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="nltk"))
            for i, token in enumerate(nltk.word_tokenize(text)):
                logging.info("Found token %s", token)
                sentence.tokenization.tokenList.tokenList.append(
                    Token(tokenIndex=i, text=token))
    return communication

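# Usage sketch (names are illustrative): assuming the annotate method above
# belongs to an annotator class, here called NltkTokenizer, it can be run
# over a Communication built with create_comm from concrete.util.
comm = create_comm('doc-1', 'Hello world .')
annotated = NltkTokenizer().annotate(comm)
for section in annotated.sectionList:
    for sentence in section.sentenceList:
        print([t.text for t in sentence.tokenization.tokenList.tokenList])
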
def test_get_conll_tags_one_token():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='t0'),
        ]),
        dependencyParseList=sentinel.dpl,
    )

    mock_filter_zero = Mock(return_value=[])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_zero) == []
    mock_filter_zero.assert_called_with(sentinel.dpl)

    mock_filter_one_empty = Mock(return_value=[
        DependencyParse(dependencyList=[]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_one_empty) == [
        [(u'', u'')],
    ]
    mock_filter_one_empty.assert_called_with(sentinel.dpl)

    mock_filter_one = Mock(return_value=[
        DependencyParse(dependencyList=[
            Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
        ]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_one) == [
        [(u'0', u'edge_0/0')],
    ]
    mock_filter_one.assert_called_with(sentinel.dpl)

    mock_filter_two = Mock(return_value=[
        DependencyParse(dependencyList=[
            Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
        ]),
        DependencyParse(dependencyList=[
            Dependency(gov=-1, dep=0, edgeType='edge_0/1'),
        ]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_two) == [
        [(u'0', u'edge_0/0')],
        [(u'0', u'edge_0/1')],
    ]
    mock_filter_two.assert_called_with(sentinel.dpl)

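# A minimal behavioral sketch of _get_conll_tags_for_tokenization that is
# consistent with the tests above (it is NOT the library's actual
# implementation): one (head, edge-label) pair per token, one list per
# dependency parse returned by the optional filter; CoNLL heads are 1-based,
# so a Dependency gov of -1 (the root) becomes u'0'.
def _sketch_get_conll_tags_for_tokenization(tokenization,
                                            dependency_parse_filter=None):
    if tokenization.tokenList is None:
        return []
    num_tokens = len(tokenization.tokenList.tokenList)
    if dependency_parse_filter is not None:
        parses = dependency_parse_filter(tokenization.dependencyParseList)
    else:
        parses = tokenization.dependencyParseList or []
    tag_lists = []
    for parse in parses:
        heads = [u''] * num_tokens
        edges = [u''] * num_tokens
        for dependency in parse.dependencyList:
            heads[dependency.dep] = u'%d' % (dependency.gov + 1)
            edges[dependency.dep] = dependency.edgeType
        tag_lists.append(list(zip(heads, edges)))
    return tag_lists
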
def tokenization(request):
    return Tokenization(tokenTaggingList=[
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='?',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='?'),
                TaggedToken(tokenIndex=1, tag='?'),
                TaggedToken(tokenIndex=2, tag='?'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='POS',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='X'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='LEMMA',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='mambo'),
                TaggedToken(tokenIndex=1, tag='number'),
                TaggedToken(tokenIndex=2, tag='4'),
            ],
        ),
    ])

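# Illustrative test consuming the function above (assuming it is registered
# as a pytest fixture); it checks only the structure the fixture builds:
# four TokenTaggings, two per annotation tool.
def test_tokenization_fixture_structure(tokenization):
    assert len(tokenization.tokenTaggingList) == 4
    assert set(tt.taggingType for tt in tokenization.tokenTaggingList) == \
        {'?', 'POS', 'NUMERAL', 'LEMMA'}
    assert set(tt.metadata.tool for tt in tokenization.tokenTaggingList) == {'x', 'y'}
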
def test_get_tokens_invalid_kind():
    with raises(ValueError):
        get_tokens(Tokenization(kind='invalid-kind'))

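# Complementary positive case (a sketch based on the documented TOKEN_LIST
# behavior of get_tokens, not copied from the test suite): tokens of a
# TOKEN_LIST tokenization should come back in list order.
def test_get_tokens_token_list():
    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='t0'),
            Token(tokenIndex=1, text='t1'),
        ]),
    )
    assert [t.text for t in get_tokens(tokenization)] == ['t0', 't1']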