def test_whitespace_nlp(self):
    raw = '''Hi! My name is Jason. You can call me Mr. J. Is that your name too? Ha. Ha ha. '''
    doc = whitespace_nlp(raw)
    self.assertEqual(len(list(doc)), 73)
    self.assertEqual(len(doc.sents), 1)
    tok = Tok('WORD', 'Jason', 'jason', 'Name', 'NNP')
    self.assertEqual(len(tok), 5)
    self.assertEqual(str(tok), 'jason')
    self.assertEqual(
        str(Doc([[Tok('WORD', 'Jason', 'jason', 'Name', 'NNP'),
                  Tok('WORD', 'a', 'a', 'Name', 'NNP')]],
                raw='asdfbasdfasd')),
        'asdfbasdfasd')
    self.assertEqual(
        str(Doc([[Tok('WORD', 'Blah', 'blah', 'Name', 'NNP'),
                  Tok('Space', ' ', ' ', ' ', ' '),
                  Tok('WORD', 'a', 'a', 'Name', 'NNP')]])),
        'blah a')
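
# A minimal sketch (hypothetical helper, not part of the test above) of why
# the token count in test_whitespace_nlp includes whitespace: whitespace_nlp
# emits a Tok for every word, punctuation mark, and space, and str(tok)
# appears to return the lowercased surface form.
def _demo_whitespace_nlp_tokens():
    doc = whitespace_nlp('Hi! Ha.')
    # Likely yields pairs such as ('WORD', 'hi'), ('PUNCT', '!'), ('SPACE', ' '), ...
    return [(tok.pos_, str(tok)) for tok in doc]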
def whitespace_nlp_with_fake_chunks(doc, entity_type=None, tag_type=None):
    # Like whitespace_nlp, but attaches two overlapping fake noun chunks
    # whenever the document contains at least five words.
    toks = _regex_parse_sentence(doc, entity_type, tag_type)
    words = [t for t in toks if t.pos_ == 'WORD']
    if len(words) < 5:
        return Doc([toks])
    return Doc([toks], noun_chunks=[Span(words[:2]), Span(words[1:3])])
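
# A minimal usage sketch, assuming Doc exposes the chunks passed to its
# constructor as a spaCy-style `noun_chunks` attribute and that Span supports
# str(); both are assumptions based on the constructor call above.
def _demo_fake_chunks():
    doc = whitespace_nlp_with_fake_chunks('one two three four five')
    # With five words, two overlapping fake Spans should be present.
    return [str(span) for span in doc.noun_chunks]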
def bad_whitespace_nlp(doc):
    # Deliberately flawed tokenizer: splits only on whitespace (punctuation
    # stays glued to words) and stores just the first two characters of each
    # token as its stem.
    toks = []
    for tok in doc.split():
        pos = 'WORD'
        if tok.strip() == '':
            pos = 'SPACE'
        elif re.match(r'^\W+$', tok):
            pos = 'PUNCT'
        toks.append(Tok(pos, tok[:2].lower(), tok.lower(), ent_type='', tag=''))
    return Doc([toks])
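
# Sketch illustrating the deliberate flaw (hypothetical helper): because
# tokens are split only on whitespace and stemmed to two characters,
# 'Hello world!' likely surfaces as 'hello' and 'world!' with stored stems
# 'he' and 'wo' -- which is what downstream term counting would see.
def _demo_bad_whitespace_nlp():
    doc = bad_whitespace_nlp('Hello world!')
    return [str(tok) for tok in doc]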
def _testing_nlp(doc):
    # Test-only pipeline: whitespace/punctuation tokenization with two
    # hard-coded named entities ('Tone' and 'Brooklyn').
    toks = []
    for tok in re.split(r"(\W)", doc):
        pos = 'WORD'
        ent = ''
        tag = ''
        if tok.strip() == '':
            pos = 'SPACE'
        elif re.match(r'^\W+$', tok):
            pos = 'PUNCT'
        if tok == 'Tone':
            ent = 'PERSON'
        if tok == 'Brooklyn':
            ent = 'GPE'
        toks.append(Tok(pos, tok[:2].lower(), tok.lower(), ent, tag))
    return Doc([toks])
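
# Sketch of the hard-coded entity behavior (hypothetical helper), assuming
# Tok exposes a spaCy-style ent_type_ attribute as suggested by the ent_type
# keyword used elsewhere in this module: only the exact strings 'Tone' and
# 'Brooklyn' receive entity labels.
def _demo_testing_nlp_entities():
    doc = _testing_nlp('Tone lives in Brooklyn.')
    return [(str(tok), tok.ent_type_) for tok in doc if tok.ent_type_]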
def tokenize(self, doc):
    '''
    doc: str, text to be tokenized
    '''
    sents = []
    decoded_text = self.decoder(doc)
    tokens = self.tokenizer.convert_ids_to_tokens(
        self.tokenizer(decoded_text)['input_ids'],
        skip_special_tokens=True)
    last_idx = 0
    toks = []
    for raw_token in tokens:
        token_surface_string = raw_token
        if ord(raw_token[0]) == 288:  # 'Ġ' (U+0120) marks a leading space in byte-level BPE
            token_surface_string = raw_token[1:]
        if ord(raw_token[0]) == 266:  # 'Ċ' (U+010A) encodes a newline; skip it
            last_idx += len(raw_token)
            continue
        token_idx = decoded_text.index(token_surface_string, last_idx)
        toks.append(
            Tok(_get_pos_tag(token_surface_string),
                token_surface_string.lower(),
                raw_token.lower(),
                ent_type='' if self.entity_type is None
                    else self.entity_type.get(token_surface_string, ''),
                tag='' if self.tag_type is None
                    else self.tag_type.get(token_surface_string, ''),
                idx=token_idx))
        last_idx = token_idx + len(token_surface_string)
        if token_surface_string in ['.', '!', '?']:  # naive sentence splitter
            sents.append(toks)
            toks = []
    if len(toks) > 0:
        sents.append(toks)
    return Doc(sents, decoded_text)
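
# A minimal usage sketch for tokenize(), assuming it belongs to a wrapper
# object holding a Hugging Face byte-level BPE tokenizer (where 'Ġ'-prefixed
# tokens mark spaces) plus a `decoder` callable; the wrapper name and
# constructor below are hypothetical.
#
#   from transformers import RobertaTokenizerFast
#   wrapper = SomeTokenizerWrapper(
#       tokenizer=RobertaTokenizerFast.from_pretrained('roberta-base'),
#       decoder=lambda text: text)
#   doc = wrapper.tokenize('I like it. It is good.')
#   assert len(doc.sents) == 2  # the naive splitter breaks on ., !, ?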