def test_repr_on_tokenization():
    tokenization = Tokenization(
        metadata=AnnotationMetadata(
            tool="test",
            timestamp=int(time.time())),
        uuid=UUID(uuidString='01234567-0123-4567-89ab-cdef89abcdef')
    )
    tokenization.__repr__()

def test_repr_on_tokenization(self):
    tokenization = Tokenization(
        metadata=AnnotationMetadata(
            tool="test",
            timestamp=int(time.time())),
        uuid=UUID(uuidString='01234567-0123-4567-89ab-cdef89abcdef')
    )
    tokenization.__repr__()

def create_sentence(sen_text, sen_start, sen_end, aug, metadata_tool,
                    metadata_timestamp, annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).
    '''
    sections = (annotation_level is not None) and (annotation_level != AL_NONE)
    sentences = sections and (annotation_level != AL_SECTION)
    tokens = sentences and (annotation_level != AL_SENTENCE)

    return Sentence(
        uuid=aug.next(),
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(
                    tokenIndex=i,
                    text=tok_text,
                ) for (i, tok_text) in enumerate(sen_text.split())
            ]),
        ) if tokens else None,
    )

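# Hypothetical usage of create_sentence. `aug` is an analytic UUID generator
# as used elsewhere in these snippets, and AL_TOKEN is assumed to be the
# annotation-level constant that enables tokenization (alongside the AL_NONE,
# AL_SECTION, and AL_SENTENCE constants referenced above).
augf = AnalyticUUIDGeneratorFactory()
aug = augf.create()
sentence = create_sentence('Hello world .', 0, 13, aug,
                           'example-tool', int(time.time()), AL_TOKEN)
assert len(sentence.tokenization.tokenList.tokenList) == 3
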
def test_get_conll_tags_zero_tokens_implicit_filter():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[]),
        dependencyParseList=[
            DependencyParse(dependencyList=[]),
        ])
    assert _get_conll_tags_for_tokenization(tokenization) == [[]]

def test_get_conll_tags_no_token_list():
    tokenization = Tokenization()
    assert _get_conll_tags_for_tokenization(tokenization) == []
    mock_filter = Mock(return_value=[])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter) == []

def create_sentence_with_token(sentence_start, sentence_ending,
                               token_start, token_ending):
    token_textspan = TextSpan(start=token_start, ending=token_ending)
    token = Token(textSpan=token_textspan)
    tokenization = Tokenization(tokenList=TokenList(tokenList=[token]))
    sentence_textspan = TextSpan(start=sentence_start, ending=sentence_ending)
    sentence = Sentence(tokenization=tokenization,
                        textSpan=sentence_textspan,
                        uuid='TEST')
    return sentence

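# Example (offsets are illustrative): a sentence covering characters 0-11
# that contains a single token covering characters 0-5.
sentence = create_sentence_with_token(0, 11, 0, 5)
token = sentence.tokenization.tokenList.tokenList[0]
assert (token.textSpan.start, token.textSpan.ending) == (0, 5)
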
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=aug.next())

    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(tokenList=[]),
        uuid=aug.next())

    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(
            Token(text=token_string, tokenIndex=i))

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=aug.next())

    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm

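# Minimal usage sketch: with the default sentence_string, the single
# Sentence is whitespace-tokenized into four tokens.
comm = create_simple_comm('test-comm-1')
tokens = comm.sectionList[0].sentenceList[0].tokenization.tokenList.tokenList
assert [t.text for t in tokens] == ['Super', 'simple', 'sentence', '.']
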
def test_get_conll_tags_zero_tokens():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[]),
        dependencyParseList=sentinel.dpl,
    )

    mock_filter = Mock(return_value=[
        DependencyParse(dependencyList=[]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter) == [[]]
    mock_filter.assert_called_with(sentinel.dpl)

def json_to_concrete(doc: Dict) -> Communication:
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(
                                    tokenIndex=i,
                                    text=t
                                ) for i, t in enumerate(get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )
    return comm

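# Hypothetical usage of json_to_concrete. It assumes the module-level `augf`
# UUID generator and the get_flatten_sentence helper referenced above are
# already defined; 'doc_key' and 'language_id' mirror the keys the function
# reads, and the remaining structure is whatever get_flatten_sentence expects.
doc = {
    'doc_key': 'doc-001',
    'language_id': 'eng',
    # ... sentence content consumed by get_flatten_sentence ...
}
comm = json_to_concrete(doc)
assert comm.id == 'doc-001'
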
def _comm_with_properties(num_properties):
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0, text='text', textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(), metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST, tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(), kind='kind', label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid, tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(), entityType='entityType',
                       text='text', tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(), metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = list(
        Property(value="Property%d" % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties))
    am = MentionArgument(role='role', entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs, argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(), metadata=meta_sms,
                              mentionList=[sm])

    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(), id='id', text='text',
                         type='type', metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm

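# Example: a Communication whose single SituationMention argument carries
# three Property annotations.
comm = _comm_with_properties(3)
mention = comm.situationMentionSetList[0].mentionList[0]
assert len(mention.argumentList[0].propertyList) == 3
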
def test_get_conll_tags_one_token_implicit_filter():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='t0'),
        ]),
        dependencyParseList=[
            DependencyParse(dependencyList=[
                Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
            ]),
        ],
    )
    assert _get_conll_tags_for_tokenization(tokenization) == [
        [(u'0', u'edge_0/0')],
    ]

def annotate(self, communication):
    print(communication.id)
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            text = communication.text[sentence.textSpan.start:sentence.textSpan.ending]
            # Replace any existing tokenization with a fresh NLTK-based one.
            sentence.tokenization = Tokenization(
                uuid=next(aug),
                kind=TokenizationKind.TOKEN_LIST,
                tokenList=TokenList(tokenList=[]),
                tokenTaggingList=[],
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="nltk"))
            for i, token in enumerate(nltk.word_tokenize(text)):
                logging.info("Found token %s", token)
                sentence.tokenization.tokenList.tokenList.append(
                    Token(tokenIndex=i, text=token))
    return communication

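# Usage sketch (names are illustrative): assuming the annotate method above
# belongs to an annotator class, here called NltkTokenizer, it can be run
# over a Communication built with create_comm from concrete.util.
comm = create_comm('doc-1', 'Hello world .')
annotated = NltkTokenizer().annotate(comm)
for section in annotated.sectionList:
    for sentence in section.sentenceList:
        print([t.text for t in sentence.tokenization.tokenList.tokenList])
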
def test_get_conll_tags_one_token():
    tokenization = Tokenization(
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='t0'),
        ]),
        dependencyParseList=sentinel.dpl,
    )

    mock_filter_zero = Mock(return_value=[])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_zero) == []
    mock_filter_zero.assert_called_with(sentinel.dpl)

    mock_filter_one_empty = Mock(return_value=[
        DependencyParse(dependencyList=[]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_one_empty) == [
        [(u'', u'')],
    ]
    mock_filter_one_empty.assert_called_with(sentinel.dpl)

    mock_filter_one = Mock(return_value=[
        DependencyParse(dependencyList=[
            Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
        ]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_one) == [
        [(u'0', u'edge_0/0')],
    ]
    mock_filter_one.assert_called_with(sentinel.dpl)

    mock_filter_two = Mock(return_value=[
        DependencyParse(dependencyList=[
            Dependency(gov=-1, dep=0, edgeType='edge_0/0'),
        ]),
        DependencyParse(dependencyList=[
            Dependency(gov=-1, dep=0, edgeType='edge_0/1'),
        ]),
    ])
    assert _get_conll_tags_for_tokenization(tokenization, mock_filter_two) == [
        [(u'0', u'edge_0/0')],
        [(u'0', u'edge_0/1')],
    ]
    mock_filter_two.assert_called_with(sentinel.dpl)

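# A minimal behavioral sketch of _get_conll_tags_for_tokenization that is
# consistent with the tests above (it is NOT the library's actual
# implementation): one (head, edge-label) pair per token, one list per
# dependency parse returned by the optional filter; CoNLL heads are 1-based,
# so a Dependency gov of -1 (the root) becomes u'0'.
def _sketch_get_conll_tags_for_tokenization(tokenization,
                                            dependency_parse_filter=None):
    if tokenization.tokenList is None:
        return []
    num_tokens = len(tokenization.tokenList.tokenList)
    if dependency_parse_filter is not None:
        parses = dependency_parse_filter(tokenization.dependencyParseList)
    else:
        parses = tokenization.dependencyParseList or []
    tag_lists = []
    for parse in parses:
        heads = [u''] * num_tokens
        edges = [u''] * num_tokens
        for dependency in parse.dependencyList:
            heads[dependency.dep] = u'%d' % (dependency.gov + 1)
            edges[dependency.dep] = dependency.edgeType
        tag_lists.append(list(zip(heads, edges)))
    return tag_lists
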
def tokenization(request):
    return Tokenization(tokenTaggingList=[
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='?',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='?'),
                TaggedToken(tokenIndex=1, tag='?'),
                TaggedToken(tokenIndex=2, tag='?'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='POS',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='X'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='LEMMA',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='mambo'),
                TaggedToken(tokenIndex=1, tag='number'),
                TaggedToken(tokenIndex=2, tag='4'),
            ],
        ),
    ])

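# Illustrative test consuming the function above (assuming it is registered
# as a pytest fixture); it checks only the structure the fixture builds:
# four TokenTaggings, two per annotation tool.
def test_tokenization_fixture_structure(tokenization):
    assert len(tokenization.tokenTaggingList) == 4
    assert set(tt.taggingType for tt in tokenization.tokenTaggingList) == \
        {'?', 'POS', 'NUMERAL', 'LEMMA'}
    assert set(tt.metadata.tool for tt in tokenization.tokenTaggingList) == {'x', 'y'}
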
def test_get_tokens_invalid_kind():
    with raises(ValueError):
        get_tokens(Tokenization(kind='invalid-kind'))

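# Complementary positive case (a sketch based on the documented TOKEN_LIST
# behavior of get_tokens, not copied from the test suite): tokens of a
# TOKEN_LIST tokenization should come back in list order.
def test_get_tokens_token_list():
    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[
            Token(tokenIndex=0, text='t0'),
            Token(tokenIndex=1, text='t1'),
        ]),
    )
    assert [t.text for t in get_tokens(tokenization)] == ['t0', 't1']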