Example #1
    def test_whitespace_nlp(self):
        raw = '''Hi! My name
		is Jason.  You can call me
		Mr. J.  Is that your name too?
		Ha. Ha ha.
		'''
        doc = whitespace_nlp(raw)
        self.assertEqual(len(list(doc)), 73)
        self.assertEqual(len(doc.sents), 1)
        tok = Tok('WORD', 'Jason', 'jason', 'Name', 'NNP')
        self.assertEqual(len(tok), 5)
        self.assertEqual(str(tok), 'jason')
        self.assertEqual(
            str(
                Doc([[
                    Tok('WORD', 'Jason', 'jason', 'Name', 'NNP'),
                    Tok('WORD', 'a', 'a', 'Name', 'NNP')
                ]],
                    raw='asdfbasdfasd')), 'asdfbasdfasd')
        self.assertEqual(
            str(
                Doc([[
                    Tok('WORD', 'Blah', 'blah', 'Name', 'NNP'),
                    Tok('Space', ' ', ' ', ' ', ' '),
                    Tok('WORD', 'a', 'a', 'Name', 'NNP')
                ]])), 'blah a')
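These assertions pin down the behaviour of the lightweight spaCy stand-ins: iterating a Doc yields its Tok objects, sents holds a single sentence, str(Tok) returns the lowercase form, and str(Doc) echoes the raw text when one was supplied. Below is a minimal sketch of the same API outside a unittest, assuming the helpers live in scattertext's WhitespaceNLP module (the import path is my assumption, not part of the test above).

# Minimal sketch; the import path is an assumption.
from scattertext.WhitespaceNLP import whitespace_nlp

doc = whitespace_nlp('Hi! My name is Jason.')
print(len(list(doc)))   # number of Tok objects (words plus whitespace/punctuation tokens)
print(len(doc.sents))   # 1 -- whitespace_nlp does not attempt sentence splitting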
Example #2
def whitespace_nlp_with_fake_chunks(doc, entity_type=None, tag_type=None):
    # Tokenize with the shared regex parser, then attach two overlapping fake
    # noun chunks (first/second and second/third words) once the document has
    # at least five word tokens; shorter documents get no chunks.
    toks = _regex_parse_sentence(doc, entity_type, tag_type)
    words = [t for t in toks if t.pos_ == 'WORD']
    if len(words) < 5:
        return Doc([toks])
    else:
        return Doc([toks], noun_chunks=[Span(words[:2]), Span(words[1:3])])
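A hedged usage sketch of the helper above, assuming the Doc class stores the chunks on a noun_chunks attribute (the sample text is made up):

doc = whitespace_nlp_with_fake_chunks('one two three four five six')
print(len(doc.noun_chunks))   # 2 -- the two overlapping fake chunks; shorter texts get none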
Example #3
import re


def bad_whitespace_nlp(doc):
    # A deliberately 'bad' tokenizer: str.split() discards whitespace entirely
    # (so the SPACE branch never fires) and the second Tok argument keeps only
    # the first two characters of each token.
    toks = []
    for tok in doc.split():
        pos = 'WORD'
        if tok.strip() == '':
            pos = 'SPACE'
        elif re.match(r'^\W+$', tok):
            pos = 'PUNCT'
        toks.append(Tok(pos, tok[:2].lower(), tok.lower(), ent_type='',
                        tag=''))
    return Doc([toks])
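A quick usage sketch of the helper above; Doc iteration and str(Tok) behaviour follow the test in Example #1, and the sample text is made up:

doc = bad_whitespace_nlp('Hello, Brooklyn!')
for tok in doc:
    print(tok.pos_, str(tok))
# expected output, roughly:
# WORD hello,
# WORD brooklyn!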
Example #4
import re


def _testing_nlp(doc):
    # Test-fixture tokenizer: re.split keeps each whitespace/punctuation
    # character as its own token, and 'Tone'/'Brooklyn' get hard-coded entity
    # labels so entity handling can be exercised without a real NER model.
    toks = []
    for tok in re.split(r"(\W)", doc):
        pos = 'WORD'
        ent = ''
        tag = ''
        if tok.strip() == '':
            pos = 'SPACE'
        elif re.match(r'^\W+$', tok):
            pos = 'PUNCT'
        if tok == 'Tone':
            ent = 'PERSON'
        if tok == 'Brooklyn':
            ent = 'GPE'
        toks.append(Tok(pos, tok[:2].lower(), tok.lower(), ent, tag))
    return Doc([toks])
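A hedged sketch of the fake entity labels in action; the ent_type_ attribute name mirrors spaCy's and is an assumption about the Tok class used here:

doc = _testing_nlp('Tone lives in Brooklyn')
for tok in doc:
    if tok.ent_type_:
        print(str(tok), tok.ent_type_)
# tone PERSON
# brooklyn GPE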
Example #5
    def tokenize(self, doc):
        '''
        doc: str, text to be tokenized
        '''

        sents = []
        decoded_text = self.decoder(doc)
        # Run the subword tokenizer, then recover the token strings.
        tokens = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer(decoded_text)['input_ids'],
            skip_special_tokens=True)

        last_idx = 0
        toks = []
        for raw_token in tokens:
            token_surface_string = raw_token
            if ord(raw_token[0]) == 288:  # 'Ġ' (U+0120): byte-level BPE marker for a leading space
                token_surface_string = raw_token[1:]
            if ord(raw_token[0]) == 266:  # 'Ċ' (U+010A): newline token; skip it
                last_idx += len(raw_token)
                continue
            # Re-align the token with its character offset in the decoded text.
            token_idx = decoded_text.index(token_surface_string, last_idx)
            toks.append(
                Tok(_get_pos_tag(token_surface_string),
                    token_surface_string.lower(),
                    raw_token.lower(),
                    ent_type='' if self.entity_type is None else
                    self.entity_type.get(token_surface_string, ''),
                    tag='' if self.tag_type is None else self.tag_type.get(
                        token_surface_string, ''),
                    idx=token_idx))
            last_idx = token_idx + len(token_surface_string)
            if token_surface_string in ['.', '!', '?']:  # naive sentence splitter
                sents.append(toks)
                toks = []

        if len(toks) > 0:
            sents.append(toks)
        return Doc(sents, decoded_text)
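The method above re-aligns byte-level BPE subwords (Hugging Face GPT-2 style) with character offsets in the decoded text, treating a leading 'Ġ' as a space marker and 'Ċ' as a newline, then cutting sentences on terminal punctuation. A small sketch of those markers, assuming a transformers GPT-2 tokenizer (the model name is illustrative):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
pieces = tokenizer.convert_ids_to_tokens(tokenizer(' hello\nworld')['input_ids'])
print(pieces)              # e.g. ['Ġhello', 'Ċ', 'world']
print(ord('Ġ'), ord('Ċ'))  # 288 266 -- the two code points checked above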