def create_section_with_sentence(section_start, section_ending, sentence_start,
                                 sentence_ending):
    """Build a test Section wrapping a single Sentence.

    Args:

    - `section_start`: character offset where the section begins
    - `section_ending`: character offset where the section ends
    - `sentence_start`: character offset where the sentence begins
    - `sentence_ending`: character offset where the sentence ends

    Returns:

    - A Section (uuid 'TEST_SECTION') whose sentenceList contains one
      Sentence (uuid 'TEST_SENTENCE') with the given span.
    """
    inner_sentence = Sentence(
        textSpan=TextSpan(start=sentence_start, ending=sentence_ending),
        uuid='TEST_SENTENCE')
    return Section(
        sentenceList=[inner_sentence],
        textSpan=TextSpan(start=section_start, ending=section_ending),
        uuid='TEST_SECTION')
def create_sentence_with_token(sentence_start, sentence_ending, token_start,
                               token_ending):
    """Build a test Sentence whose Tokenization holds a single Token.

    Args:

    - `sentence_start`: character offset where the sentence begins
    - `sentence_ending`: character offset where the sentence ends
    - `token_start`: character offset where the token begins
    - `token_ending`: character offset where the token ends

    Returns:

    - A Sentence (uuid 'TEST') containing one Token with the given span.
    """
    lone_token = Token(
        textSpan=TextSpan(start=token_start, ending=token_ending))
    return Sentence(
        tokenization=Tokenization(tokenList=TokenList(tokenList=[lone_token])),
        textSpan=TextSpan(start=sentence_start, ending=sentence_ending),
        uuid='TEST')
Ejemplo n.º 3
0
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
       The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    tool = "TEST"
    now = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    # UUIDs are drawn in the same order as before: communication,
    # tokenization, sentence, section.
    comm = Communication(id=comm_id,
                         metadata=AnnotationMetadata(tool=tool, timestamp=now),
                         type=tool,
                         uuid=aug.next())

    # Whitespace-tokenize up front instead of appending inside a loop.
    tokens = [Token(text=word, tokenIndex=index)
              for index, word in enumerate(sentence_string.split())]
    tokenization = Tokenization(kind=TokenizationKind.TOKEN_LIST,
                                metadata=AnnotationMetadata(tool=tool,
                                                            timestamp=now),
                                tokenList=TokenList(tokenList=tokens),
                                uuid=aug.next())

    sentence = Sentence(textSpan=TextSpan(0, len(sentence_string)),
                        tokenization=tokenization,
                        uuid=aug.next())

    section = Section(kind="SectionKind",
                      sentenceList=[sentence],
                      textSpan=TextSpan(0, len(sentence_string)),
                      uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
Ejemplo n.º 4
0
def create_sentence(sen_text, sen_start, sen_end, aug, metadata_tool,
                    metadata_timestamp, annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).

    A Tokenization (whitespace tokens over `sen_text`) is attached only
    when `annotation_level` requests token-level annotation; otherwise
    the sentence's tokenization is None.
    '''

    # Each level of annotation implies the previous one.
    has_sections = (annotation_level is not None) and \
        (annotation_level != AL_NONE)
    has_sentences = has_sections and (annotation_level != AL_SECTION)
    has_tokens = has_sentences and (annotation_level != AL_SENTENCE)

    # Draw the sentence UUID before the tokenization UUID, matching the
    # generator call order of the keyword-argument evaluation.
    sentence_uuid = aug.next()

    tokenization = None
    if has_tokens:
        tokenization = Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(tokenIndex=index, text=word)
                for index, word in enumerate(sen_text.split())
            ]),
        )

    return Sentence(
        uuid=sentence_uuid,
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=tokenization,
    )
Ejemplo n.º 5
0
def _comm_with_properties(num_properties):
    """Build a test Communication carrying `num_properties` Property objects.

    The Communication contains one Section/Sentence/Tokenization, one
    EntityMention, and one SituationMention whose single MentionArgument
    holds the requested number of properties.  References are wired in
    with add_references_to_communication before returning.
    """
    timestamp = 17
    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='tokn-tool', timestamp=timestamp),
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='text',
                  textSpan=TextSpan(start=0, ending=1))
        ]))
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    # Shared by both the entity mention and the situation mention.
    token_refs = TokenRefSequence(tokenizationId=tokenization.uuid,
                                  tokenIndexList=[0],
                                  anchorTokenIndex=0)
    entity_mention = EntityMention(uuid=generate_UUID(),
                                   entityType='entityType',
                                   text='text',
                                   tokens=token_refs)
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='ems-tool', timestamp=timestamp),
        mentionList=[entity_mention])
    # All properties share one metadata instance, as before.
    property_metadata = AnnotationMetadata(tool='Annotator1',
                                           timestamp=timestamp)
    properties = [
        Property(value="Property%d" % i,
                 metadata=property_metadata,
                 polarity=4.0)
        for i in range(num_properties)
    ]
    argument = MentionArgument(role='role',
                               entityMentionId=entity_mention.uuid,
                               propertyList=properties)
    situation_mention = SituationMention(uuid=generate_UUID(),
                                         tokens=token_refs,
                                         argumentList=[argument])
    situation_mention_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='sms-tool', timestamp=timestamp),
        mentionList=[situation_mention])
    comm = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=AnnotationMetadata(tool='tool', timestamp=timestamp),
        sectionList=[section],
        situationMentionSetList=[situation_mention_set],
        entityMentionSetList=[entity_mention_set])
    add_references_to_communication(comm)
    return comm
Ejemplo n.º 6
0
    def annotate(self, communication):
        """Append NLTK sentence boundaries to each section of `communication`.

        Each section's text is split with nltk.sent_tokenize; for every
        sentence found, a Sentence with an analytic UUID and a TextSpan
        (offsets into communication.text) is appended to the section's
        sentenceList.  The Communication is mutated in place and returned.
        """
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()

        for section in communication.sectionList:
            # Slice out this section's text; `current_offset` tracks the
            # absolute position of the start of the unconsumed text.
            text = communication.text[section.textSpan.start:section.textSpan.ending]
            current_offset = section.textSpan.start
            for sent in nltk.sent_tokenize(text):
                logging.info("Found sentence %s", sent)
                # Offset of this sentence relative to `current_offset`.
                # NOTE(review): find() returns -1 if the tokenizer altered
                # the text so it no longer matches verbatim — confirm
                # sent_tokenize always yields exact substrings here.
                initial = text.find(sent)
                s = Sentence(uuid=aug.next(),
                             textSpan=TextSpan(start=current_offset + initial, ending=current_offset + initial + len(sent)))
                section.sentenceList.append(s)
                # Advance past this sentence and re-slice so the next
                # find() only searches the remaining tail.
                # NOTE(review): the re-slice uses communication.text, so
                # the search window extends past the section's ending —
                # verify this is intended.
                current_offset = current_offset + initial + len(sent)
                text = communication.text[current_offset:]
        return communication
Ejemplo n.º 7
0
def create_section(sec_text, sec_start, sec_end, section_kind, aug,
                   metadata_tool, metadata_timestamp, annotation_level):
    '''
    Create section from provided text and metadata.
    Lower-level routine (called by create_comm).

    Sentences are created only when `annotation_level` requests
    sentence-level annotation; a section whose text is empty (and has no
    newline) gets an empty sentence list rather than None.
    '''

    has_sections = (annotation_level is not None) and \
        (annotation_level != AL_NONE)
    has_sentences = has_sections and (annotation_level != AL_SECTION)

    # Draw the section UUID before any sentence UUIDs, matching the
    # generator call order of the keyword-argument evaluation.
    section_uuid = aug.next()

    sentence_list = None
    if has_sentences:
        if ('\n' in sec_text) or sec_text.strip():
            # Sentence offsets from _split are relative to the section,
            # so shift them by sec_start to make them absolute.
            sentence_list = [
                create_sentence(sen_text, sec_start + sen_start,
                                sec_start + sen_end, aug, metadata_tool,
                                metadata_timestamp, annotation_level)
                for (sen_text, sen_start, sen_end) in _split(sec_text, '\n')
            ]
        else:
            sentence_list = []

    return Section(
        uuid=section_uuid,
        textSpan=TextSpan(sec_start, sec_end),
        kind=section_kind,
        sentenceList=sentence_list,
    )
Ejemplo n.º 8
0
    def index():
        """Web form handler: annotate user text with a remote Annotator.

        Reads `text` from the POSTed form, wraps it in a minimal concrete
        Communication (one "paragraph" Section spanning the whole text),
        ships it to the configured Annotator service over a framed
        compact-protocol Thrift socket, and renders the input form plus
        the entity type/name pairs from each entity set of the response.
        """
        text = request.forms.get('text')
        transport = TTransport.TFramedTransport(
            TSocket.TSocket(options.annotator_host, options.annotator_port))
        protocol = TCompactProtocol.TCompactProtocol(transport)
        client = Annotator.Client(protocol)
        transport.open()
        try:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            c = Communication(
                id="",
                text=text,
                uuid=aug.next(),
                type="user-supplied input",
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="stdin"),
                sectionList=[
                    Section(uuid=aug.next(),
                            sentenceList=[],
                            kind="paragraph",
                            textSpan=TextSpan(start=0, ending=len(text)))
                ],
                entitySetList=[],
                entityMentionSetList=[],
            )

            new_c = client.annotate(c)
        finally:
            # BUG FIX: the transport was previously never closed, leaking
            # one socket per request.
            transport.close()
        form = '''<form action="/" method="post">
        Enter or paste some text: <input name="text" type="text" />
        <input value="Submit" type="submit" />
        </form>
        '''
        # NOTE(review): `text` (and the annotator's output) is interpolated
        # into HTML without escaping — XSS risk; consider escaping before
        # rendering.
        return form + "\n".join(["<h3>%s</h3>" % text] + [
            "\n".join([
                "<br>%s %s" % (e.type, e.canonicalName) for e in es.entityList
            ]) for es in new_c.entitySetList
        ])
Ejemplo n.º 9
0
    # Convert each line of a gzipped TSV (id, label, text) into a concrete
    # Communication with a single gold-label CommunicationTagging and one
    # "content" Section, writing each to a .tgz archive.
    # NOTE(review): `reader`, `options`, and `ugf` are defined outside this
    # fragment.
    ofd = CommunicationWriterTGZ(options.output)
    with reader(gzip.open(options.input)) as ifd:
        for i, line in enumerate(ifd):
            toks = line.strip().split("\t")
            # Skip malformed rows that do not have exactly three fields.
            if len(toks) != 3:
                continue
            cid, label, text = toks
            g = ugf.create()
            t = int(time())
            comm = Communication(id=cid,
                                 uuid=g.next(),
                                 type="Text document",
                                 text=text,
                                 communicationTaggingList=[CommunicationTagging(uuid=g.next(),
                                                                                metadata=AnnotationMetadata(tool="Gold labeling",
                                                                                                            timestamp=t,
                                                                                                            kBest=1,
                                                                                ),
                                                                                taggingType=options.tag_type,
                                                                                tagList=[label],
                                                                                confidenceList=[1.0],
                                 )],
                                 metadata=AnnotationMetadata(tool="text_to_concrete.py ingester", timestamp=t, kBest=1),
                                 sectionList=[Section(uuid=g.next(),
                                                      textSpan=TextSpan(start=0, ending=len(text)),
                                                      kind="content",
                                                      )
                                 ])
            ofd.write(comm)
    ofd.close()
Ejemplo n.º 10
0
    # Interactive REPL (Python 2: raw_input / print statement): read lines
    # from stdin, wrap each in a minimal Communication, send it to a remote
    # annotator over the already-constructed Thrift `transport`/`client`
    # (defined outside this fragment), and print the entities found.
    transport.open()

    while True:
        s = raw_input("Write some text > ")
        # A blank (whitespace-only) line ends the loop.
        if re.match(r"^\s*$", s):
            break
        else:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            # Minimal valid Communication: one "paragraph" Section spanning
            # the entire input string.
            c = Communication(
                id="",
                text=s,
                uuid=aug.next(),
                type="user-supplied input",
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="stdin"),
                sectionList=[
                    Section(uuid=aug.next(),
                            sentenceList=[],
                            kind="paragraph",
                            textSpan=TextSpan(start=0, ending=len(s)))
                ],
                entitySetList=[],
                entityMentionSetList=[],
            )

            new_c = client.annotate(c)
            # Print every entity's type and canonical name from each set.
            for es in new_c.entitySetList:
                for e in es.entityList:
                    print "%s %s" % (e.type, e.canonicalName)