Example No. 1
def comm_with_other_tags(*additional_tagging_types):
    comm = create_comm('quick', '''\
The quick brown fox jumped
over the lazy dog .

Or did she ?
''')
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            sentence.tokenization.tokenTaggingList = [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'upper',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.upper(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'lower',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.lower(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
            ] + [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool/{}'.format(i),
                        timestamp=1,
                    ),
                    taggingType=tagging_type,
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag='{}_{}/{}'.format(tagging_type, token.tokenIndex, i),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                )
                for (i, tagging_type) in enumerate(additional_tagging_types)
            ]
    return comm
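
A minimal usage sketch, assuming concrete-python's module layout (the import paths below are an assumption, not part of the original example):

# Assumed imports; concrete-python re-exports the Thrift types at top level.
from concrete import AnnotationMetadata, TaggedToken, TokenTagging
from concrete.util.concrete_uuid import generate_UUID
from concrete.util.simple_comm import create_comm

comm = comm_with_other_tags(u'pos', u'ner')
# Every sentence now carries four token taggings: 'upper', 'lower', 'pos', 'ner'.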
Example No. 2
def test_validate_minimal_communication_with_uuid():
    comm = Communication()
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(tool="TEST", timestamp=int(time.time()))
    comm.type = "Test Communication"
    comm.uuid = generate_UUID()
    assert validate_communication(comm)
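
For reference, this test presumably relies on the following imports (paths assumed from concrete-python; treat them as an assumption):

import time
from concrete import AnnotationMetadata, Communication
from concrete.util.concrete_uuid import generate_UUID
from concrete.validate import validate_communication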
Example No. 3
    def search(self, query):
        logger.info("Received SearchQuery: '%s'" % query)
        search_result_items = []
        weights = (None if query.labels is None or len(query.labels) == 0 else
                   [float(e) for e in query.labels])
        entities, log_distribution = self.index.query(query.terms,
                                                      query.k,
                                                      weights=weights)
        lm = sorted(self.index.feat2id.items(),
                    key=lambda x: log_distribution[self.index.feat2id[x[0]]],
                    reverse=True)
        query.labels = [e[0] for e in lm[:self.k_query]]
        for guid, score in entities:
            search_result_item = SearchResultItem()
            uuid_sentences = self.index.guid2sent[guid]
            ss = np.empty((len(uuid_sentences), ))
            for idx, (uuid, sent) in enumerate(uuid_sentences):
                ss[idx] = lm_score(log_distribution, self.index.feat2id, sent)
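            # argsort is ascending, so this reversed slice yields the
            # k_rationale highest-scoring sentence indices, best first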
            sorted_idi = np.argsort(ss)[-1:-self.k_rationale - 1:-1]
            sents = [' '.join(uuid_sentences[e][1]) for e in sorted_idi]
            uuidi = [uuid_sentences[e][0] for e in sorted_idi]
            search_result_item.communicationId = guid + '\n' + '\n'.join(sents)
            search_result_item.sentenceId = None
            search_result_item.score = score
            entity = Entity()
            entity.uuid = generate_UUID()
            entity.id = guid
            uuidList = []
            for single_uuid in uuidi:
                uuidObj = UUID()
                uuidObj.uuidString = single_uuid
                uuidList.append(uuidObj)
            entity.mentionIdList = uuidList
            search_result_item.entity = entity
            search_result_items.append(search_result_item)

        search_result = SearchResult()
        search_result.uuid = generate_UUID()
        search_result.searchResultItems = search_result_items
        search_result.searchQuery = query
        logger.info("Returned SearchResult with %d SearchResultItems\n" %
                    len(search_result.searchResultItems))
        return search_result
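
The reversed argsort slice above is the usual NumPy idiom for "top-k indices, best first". A self-contained illustration:

import numpy as np

ss = np.array([0.2, 0.9, 0.5, 0.1])
k = 2
top = np.argsort(ss)[-1:-k - 1:-1]
print(top)  # [1 2]: the indices of the two largest scores, descending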
Example No. 4
def _comm_with_properties(num_properties):
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0, text='text', textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(),
                        metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST,
                        tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid,
                            tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(),
                       entityType='entityType',
                       text='text',
                       tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(),
                           metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = list(
        Property(value="Property%d" % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties))
    am = MentionArgument(role='role',
                         entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs, argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(),
                              metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(),
                         id='id',
                         text='text',
                         type='type',
                         metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm
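
A quick check of what the fixture builds, grounded in the constructor calls above:

comm = _comm_with_properties(2)
am = comm.situationMentionSetList[0].mentionList[0].argumentList[0]
print(len(am.propertyList))  # -> 2, one Property per requested property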
Example No. 5
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction,
                                toolname):
    timestamp = int(time.time() * 1e6)  # microseconds as an int (Python 3 has no 'long')
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        tg = ' '.join(prediction[start:start + len(tk.text)])
        # print(tk.text, tg)
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)
    assert len(new_pred) == len(tokens)
    # print(start, len(prediction))
    assert start == len(prediction)
    new_tokentagging = TokenTagging(taggingType=taggingType,
                                    taggedTokenList=new_pred,
                                    metadata=AnnotationMetadata(
                                        tool=toolname, timestamp=timestamp),
                                    uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
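
Note the slicing by len(tk.text): prediction is assumed to hold one tag per character of each token, and the per-character tags are joined into a single space-separated tag string per token. A hypothetical call, assuming a tokenization whose only token is the three-character string 'cat':

# Three character-level tags collapse onto the single token 'cat'.
set_tokentaggings_of_type_v(tokenization, u'ner', ['B-PER', 'I-PER', 'I-PER'], 'demo-tool')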
Example No. 6
def test_generate_UUID():
    comm = Communication()
    comm.uuid = generate_UUID()
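
Presumed imports for this smoke test (paths assumed from concrete-python):

from concrete import Communication
from concrete.util.concrete_uuid import generate_UUID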
Example No. 7
def update_concrete(comm, prediction):
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # j is the within-token offset; keep i as the token index
                    for j, tag in enumerate(prediction[start:start +
                                                       len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if j != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            # print(token_idx_list, ne_text, e_type, p_type)
                            e_mention = EntityMention(uuid=generate_UUID(),
                                                      tokens=entity_tokens,
                                                      entityType=e_type,
                                                      phraseType=p_type,
                                                      text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    # print('not in NE,', prediction[start:start + len(tk.text)])
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        # print(tag)
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    # print(token_idx_list, ne_text)
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(uuid=generate_UUID(),
                                                  tokens=entity_tokens,
                                                  entityType=e_type,
                                                  phraseType=p_type,
                                                  text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            pner_tokentagging = TokenTagging(taggingType=PRED_TAG,
                                             taggedTokenList=pred_ner_tags,
                                             metadata=AnnotationMetadata(
                                                 tool=toolname,
                                                 timestamp=timestamp),
                                             uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid]) for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(uuid=generate_UUID(),
                                          metadata=AnnotationMetadata(
                                              tool=toolname,
                                              timestamp=timestamp),
                                          mentionList=mention_list)
    entity_set = EntitySet(uuid=generate_UUID(),
                           metadata=AnnotationMetadata(tool=toolname,
                                                       timestamp=timestamp),
                           entityList=entity_list,
                           mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
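
The per-token loop above is a BIO-style decoder: a 'B-<type>' tag opens a mention, 'I-<type>' extends it, and anything else closes it, with the type split on '.' into entity type and phrase type. A minimal, self-contained sketch of that decoding idea (tag values invented for illustration; this is not the original code path):

tags = ['B-PER', 'I-PER', 'O', 'B-ORG.NOM']
spans, cur = [], None
for idx, tag in enumerate(tags):
    if tag.startswith('B-'):
        cur = [tag[2:], idx, idx]  # [type, first index, last index]
        spans.append(cur)
    elif tag.startswith('I-') and cur is not None and tag[2:] == cur[0]:
        cur[2] = idx  # extend the open span
    else:
        cur = None  # close any open span
print(spans)  # [['PER', 0, 1], ['ORG.NOM', 3, 3]]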