def create_section_with_sentence(section_start, section_ending,
                                 sentence_start, sentence_ending):
    """Build a test Section wrapping one Sentence.

    The Sentence spans [sentence_start, sentence_ending) and the Section
    spans [section_start, section_ending); both carry fixed test UUIDs.
    """
    inner_sentence = Sentence(
        textSpan=TextSpan(start=sentence_start, ending=sentence_ending),
        uuid='TEST_SENTENCE')
    return Section(
        sentenceList=[inner_sentence],
        textSpan=TextSpan(start=section_start, ending=section_ending),
        uuid='TEST_SECTION')
def create_sentence_with_token(sentence_start, sentence_ending,
                               token_start, token_ending):
    """Build a test Sentence whose Tokenization holds a single Token.

    The Token spans [token_start, token_ending); the Sentence spans
    [sentence_start, sentence_ending) and carries a fixed test UUID.
    """
    single_token = Token(textSpan=TextSpan(start=token_start,
                                           ending=token_ending))
    return Sentence(
        tokenization=Tokenization(tokenList=TokenList(tokenList=[single_token])),
        textSpan=TextSpan(start=sentence_start, ending=sentence_ending),
        uuid='TEST')
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:
    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:
    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    # NOTE: UUIDs are drawn in the order comm -> tokenization -> sentence ->
    # section, matching the original construction order of this helper.
    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=aug.next())

    # Whitespace-tokenize up front instead of appending Token objects
    # one at a time.
    token_objects = [
        Token(text=word, tokenIndex=index)
        for index, word in enumerate(sentence_string.split())
    ]
    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(tokenList=token_objects),
        uuid=aug.next())

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=aug.next())
    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string
    return comm
def create_sentence(sen_text, sen_start, sen_end, aug,
                    metadata_tool, metadata_timestamp, annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).
    '''
    # Annotation levels form a chain: no sections -> no sentences -> no tokens.
    has_sections = (annotation_level is not None) and \
        (annotation_level != AL_NONE)
    has_sentences = has_sections and (annotation_level != AL_SECTION)
    has_tokens = has_sentences and (annotation_level != AL_SENTENCE)

    # Draw the sentence UUID before any tokenization UUID so the analytic
    # UUID sequence matches the original keyword-evaluation order.
    sentence_uuid = aug.next()

    if has_tokens:
        tokenization = Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(tokenIndex=index, text=word)
                for index, word in enumerate(sen_text.split())
            ]),
        )
    else:
        tokenization = None

    return Sentence(
        uuid=sentence_uuid,
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=tokenization,
    )
def _comm_with_properties(num_properties):
    """Build a small, fully cross-referenced test Communication.

    The Communication holds one Section/Sentence/Tokenization, one
    EntityMentionSet, and one SituationMentionSet whose single
    MentionArgument carries `num_properties` Property objects.
    add_references_to_communication is applied before returning.
    """
    timestamp = 17

    # Single one-character token in a TOKEN_LIST tokenization.
    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='tokn-tool', timestamp=timestamp),
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='text',
                  textSpan=TextSpan(start=0, ending=1))
        ]))

    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(uuid=generate_UUID(), kind='kind', label='label',
                      sentenceList=[sentence])

    # Both the entity mention and the situation mention anchor on the
    # same token reference.
    token_ref = TokenRefSequence(tokenizationId=tokenization.uuid,
                                 tokenIndexList=[0],
                                 anchorTokenIndex=0)
    entity_mention = EntityMention(uuid=generate_UUID(),
                                   entityType='entityType',
                                   text='text',
                                   tokens=token_ref)
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='ems-tool', timestamp=timestamp),
        mentionList=[entity_mention])

    property_metadata = AnnotationMetadata(tool='Annotator1',
                                           timestamp=timestamp)
    argument = MentionArgument(
        role='role',
        entityMentionId=entity_mention.uuid,
        propertyList=[
            Property(value="Property%d" % i,
                     metadata=property_metadata,
                     polarity=4.0)
            for i in range(num_properties)
        ])

    situation_mention = SituationMention(uuid=generate_UUID(),
                                         tokens=token_ref,
                                         argumentList=[argument])
    situation_mention_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='sms-tool', timestamp=timestamp),
        mentionList=[situation_mention])

    comm = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=AnnotationMetadata(tool='tool', timestamp=timestamp),
        sectionList=[section],
        situationMentionSetList=[situation_mention_set],
        entityMentionSetList=[entity_mention_set])
    add_references_to_communication(comm)
    return comm
def annotate(self, communication):
    """Add NLTK-derived Sentences (with text spans) to each Section of
    `communication`, and return the mutated Communication.

    Sentence boundaries come from `nltk.sent_tokenize`; each detected
    sentence gets a new Sentence whose TextSpan is expressed in offsets
    into `communication.text`.
    """
    # Fresh analytic UUID generator scoped to this communication.
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        # Slice out this section's raw text; offsets below are tracked
        # relative to the full communication text.
        text = communication.text[section.textSpan.start:section.textSpan.ending]
        current_offset = section.textSpan.start
        for sent in nltk.sent_tokenize(text):
            logging.info("Found sentence %s", sent)
            # Locate the sentence within the remaining text to absorb any
            # whitespace the tokenizer skipped.
            # NOTE(review): str.find returns -1 if the tokenizer normalized
            # the sentence text (e.g. collapsed whitespace) — that would
            # silently produce a span starting one character early; confirm
            # sent_tokenize never rewrites its input here.
            initial = text.find(sent)
            s = Sentence(uuid=aug.next(),
                         textSpan=TextSpan(start=current_offset + initial,
                                           ending=current_offset + initial + len(sent)))
            # Assumes section.sentenceList is already a list (not None) —
            # TODO confirm for ingest paths that leave it unset.
            section.sentenceList.append(s)
            # Advance past this sentence and re-slice from the absolute
            # offset; note the new slice runs to the END of the whole
            # communication text, not just this section.
            current_offset = current_offset + initial + len(sent)
            text = communication.text[current_offset:]
    return communication
def create_section(sec_text, sec_start, sec_end, section_kind, aug,
                   metadata_tool, metadata_timestamp, annotation_level):
    '''
    Create section from provided text and metadata.
    Lower-level routine (called by create_comm).
    '''
    # Annotation levels form a chain: no sections -> no sentences.
    has_sections = (annotation_level is not None) and \
        (annotation_level != AL_NONE)
    has_sentences = has_sections and (annotation_level != AL_SECTION)

    # Draw the section UUID before any sentence UUIDs so the analytic
    # UUID sequence matches the original keyword-evaluation order.
    section_uuid = aug.next()

    if not has_sentences:
        sentence_list = None
    elif ('\n' in sec_text) or sec_text.strip():
        # One sentence per newline-delimited chunk; sentence offsets from
        # _split are section-relative, so shift them by sec_start.
        sentence_list = [
            create_sentence(sen_text,
                            sec_start + sen_start,
                            sec_start + sen_end,
                            aug, metadata_tool, metadata_timestamp,
                            annotation_level)
            for (sen_text, sen_start, sen_end) in _split(sec_text, '\n')
        ]
    else:
        # Whitespace-only section: annotated, but holds no sentences.
        sentence_list = []

    return Section(
        uuid=section_uuid,
        textSpan=TextSpan(sec_start, sec_end),
        kind=section_kind,
        sentenceList=sentence_list,
    )
def index():
    """Bottle handler: read the 'text' form field, send it to the remote
    Concrete annotator service over framed/compact Thrift, and render the
    input form plus the annotated entities as HTML.

    Fixes over the previous version:
    - user-supplied text and annotator output are HTML-escaped before being
      interpolated into the page (XSS);
    - the Thrift transport is always closed (was leaked).
    """
    # Escape helper for untrusted text interpolated into HTML.
    try:
        from html import escape  # Python 3
    except ImportError:          # Python 2 fallback
        from cgi import escape

    text = request.forms.get('text')
    transport = TTransport.TFramedTransport(
        TSocket.TSocket(options.annotator_host, options.annotator_port))
    protocol = TCompactProtocol.TCompactProtocol(transport)
    client = Annotator.Client(protocol)
    transport.open()
    try:
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(
            id="",
            text=text,
            uuid=aug.next(),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            sectionList=[
                Section(uuid=aug.next(),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(text)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )
        new_c = client.annotate(c)
    finally:
        # Always release the socket, even if annotate() raises.
        transport.close()

    form = '''<form action="/" method="post">
    Enter or paste some text:
    <input name="text" type="text" />
    <input value="Submit" type="submit" />
</form>
'''
    # Escape the "%s %s" rendering as a whole so None values still print
    # as "None" (matching the old behavior) while markup is neutralized.
    return form + "\n".join(
        ["<h3>%s</h3>" % escape(text)] +
        ["\n".join("<br>%s" % escape("%s %s" % (e.type, e.canonicalName))
                   for e in es.entityList)
         for es in new_c.entitySetList])
# Ingest a gzipped TSV file (id <TAB> label <TAB> text per line) into a
# .tgz archive of Concrete Communications, one per valid input row.
# Rows without exactly three tab-separated fields are skipped.
# NOTE(review): `reader`, `ugf`, and `options` are defined elsewhere in
# this file; `ofd.close()` is not exception-safe (no try/finally).
ofd = CommunicationWriterTGZ(options.output)
with reader(gzip.open(options.input)) as ifd:
    for i, line in enumerate(ifd):
        toks = line.strip().split("\t")
        if len(toks) != 3:
            # Malformed row: skip silently.
            continue
        cid, label, text = toks
        # Fresh UUID generator and timestamp per communication.
        g = ugf.create()
        t = int(time())
        comm = Communication(id=cid,
                             uuid=g.next(),
                             type="Text document",
                             text=text,
                             # The gold label rides along as a 1-best
                             # communication-level tagging.
                             communicationTaggingList=[CommunicationTagging(
                                 uuid=g.next(),
                                 metadata=AnnotationMetadata(
                                     tool="Gold labeling",
                                     timestamp=t,
                                     kBest=1,
                                 ),
                                 taggingType=options.tag_type,
                                 tagList=[label],
                                 confidenceList=[1.0],
                             )],
                             metadata=AnnotationMetadata(
                                 tool="text_to_concrete.py ingester",
                                 timestamp=t,
                                 kBest=1),
                             # Single section spanning the entire text.
                             sectionList=[Section(
                                 uuid=g.next(),
                                 textSpan=TextSpan(start=0,
                                                   ending=len(text)),
                                 kind="content",
                             )])
        ofd.write(comm)
ofd.close()
# Interactive Python 2 client loop: read a line of text from stdin, wrap
# it in a single-section Communication, send it to a remote annotator
# over the already-constructed Thrift `transport`/`client` (defined
# earlier in this file), and print the entities found. A blank (or
# whitespace-only) line exits the loop.
transport.open()
while True:
    s = raw_input("Write some text > ")  # Python 2 input
    if re.match(r"^\s*$", s):
        # Empty input terminates the session.
        break
    else:
        # Fresh analytic UUID generator per communication.
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(
            id="",
            text=s,
            uuid=aug.next(),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            # One paragraph section spanning the full input string.
            sectionList=[
                Section(uuid=aug.next(),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(s)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )
        new_c = client.annotate(c)
        # Print one "type canonicalName" line per entity returned.
        for es in new_c.entitySetList:
            for e in es.entityList:
                print "%s %s" % (e.type, e.canonicalName)