def tokenize(self, text: Text) -> List[Token]:
    """Split ``text`` into MITIE tokens carrying byte offsets.

    Encodes the text as UTF-8 (MITIE operates on bytes) and converts each
    (token, offset) pair into a ``Token`` via ``self._token_from_offset``.
    """
    import mitie

    text_bytes = text.encode('utf-8')
    token_offset_pairs = mitie.tokenize_with_offsets(text_bytes)
    return [
        self._token_from_offset(raw_token, byte_offset, text_bytes)
        for raw_token, byte_offset in token_offset_pairs
    ]
def tokenize(self, text):
    # type: (Text) -> List[Token]
    """Tokenize ``text`` with MITIE, mapping byte offsets back to char offsets.

    MITIE reports offsets into the UTF-8 byte string, so each offset is
    translated with ``self._byte_to_char_offset`` before building a Token.
    """
    import mitie

    encoded = text.encode('utf-8')
    result = []
    for raw, byte_off in mitie.tokenize_with_offsets(encoded):
        char_off = self._byte_to_char_offset(encoded, byte_off)
        result.append(Token(raw.decode('utf-8'), char_off))
    return result
def tokenize(self, text):
    # type: (Text) -> List[Token]
    """Return MITIE tokens for ``text`` with character-based offsets.

    MITIE tokenizes the UTF-8 byte form; each byte offset is converted to a
    character offset so the Tokens index into the original string.
    """
    import mitie

    utf8_text = text.encode('utf-8')
    return [
        Token(tok.decode('utf-8'), self._byte_to_char_offset(utf8_text, off))
        for tok, off in mitie.tokenize_with_offsets(utf8_text)
    ]
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenize the given ``attribute`` of ``message`` using MITIE.

    The attribute text is encoded (MITIE works on bytes), tokenized with
    offsets, converted to Tokens, and finally filtered/split by the
    configured token pattern.
    """
    import mitie

    raw_text = message.get(attribute)
    as_bytes = raw_text.encode(DEFAULT_ENCODING)
    token_list = []
    for tok, off in mitie.tokenize_with_offsets(as_bytes):
        token_list.append(self._token_from_offset(tok, off, as_bytes))
    return self._apply_token_pattern(token_list)
def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
    """Tokenize ``text`` with MITIE and append the CLS token for ``attribute``.

    Encodes the text (MITIE operates on bytes), builds Tokens from the
    (token, offset) pairs, then adds the CLS token in place before returning.
    """
    import mitie

    byte_text = text.encode(DEFAULT_ENCODING)
    tokens = [
        self._token_from_offset(tok, off, byte_text)
        for tok, off in mitie.tokenize_with_offsets(byte_text)
    ]
    self.add_cls_token(tokens, attribute)
    return tokens
def entities(extracted_text, lang):
    """Extract named entities from ``extracted_text`` with the NER model for ``lang``.

    Returns a dict with entity strings bucketed by tag ("entity_location",
    "entity_organization", "entity_person", "entity_misc") plus the
    unfiltered "entity_all" list. Entities longer than 30 characters are
    skipped; each tag bucket additionally applies a confidence threshold.
    """
    # The NER models only handle ASCII input: blank out non-ASCII chars and
    # placeholder newline markers before encoding.
    extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
    extracted_text = extracted_text.replace("[:newline:]", " ")
    extracted_text = extracted_text.encode("ascii")
    tokens = tokenize_with_offsets(extracted_text)
    entities_markup = ner_models[lang].extract_entities(tokens)
    # results contains [(tag, entity_text, offsets, score)].
    # BUG FIX: score used to be formatted to a string ("{0:.2f}") here, so the
    # numeric threshold checks below compared str against float — always
    # truthy on Python 2 and a TypeError on Python 3. Keep score numeric;
    # it is only used for thresholding, never emitted.
    results = [(tag,
                " ".join([tokens[i][0] for i in rng]),
                ",".join([str(tokens[i][1]) for i in rng]),
                score)
               for rng, tag, score in entities_markup]
    entity_doc = {}
    entity_doc["entity_all"] = []
    entity_doc["entity_location"] = []
    entity_doc["entity_organization"] = []
    entity_doc["entity_person"] = []
    entity_doc["entity_misc"] = []
    for tag, entity, rng, score in results:
        # Skip implausibly long extractions.
        if len(entity) > 30:
            continue
        entity_doc["entity_all"].append(entity)
        if tag == 'LOCATION' and score > 0.3:
            entity_doc["entity_location"].append(entity)
        elif tag == 'ORGANIZATION' and score > 0.5:
            entity_doc["entity_organization"].append(entity)
        elif tag == 'PERSON' and score > 0.3:
            entity_doc["entity_person"].append(entity)
        elif score > 0.5:
            entity_doc["entity_misc"].append(entity)
    return entity_doc
def entities(extracted_text, lang):
    """Extract named entities from ``extracted_text`` with the NER model for ``lang``.

    Returns a dict with entity strings bucketed by tag ("entity_location",
    "entity_organization", "entity_person", "entity_misc") plus the
    unfiltered "entity_all" list. Entities longer than 30 characters are
    skipped; each tag bucket additionally applies a confidence threshold.
    """
    # The NER models only handle ASCII input: blank out non-ASCII chars and
    # placeholder newline markers before encoding.
    extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
    extracted_text = extracted_text.replace("[:newline:]", " ")
    extracted_text = extracted_text.encode("ascii")
    tokens = tokenize_with_offsets(extracted_text)
    entities_markup = ner_models[lang].extract_entities(tokens)
    # results contains [(tag, entity_text, offsets, score)].
    # BUG FIX: score used to be formatted to a string ("{0:.2f}") here, so the
    # numeric threshold checks below compared str against float — always
    # truthy on Python 2 and a TypeError on Python 3. Keep score numeric;
    # it is only used for thresholding, never emitted.
    results = [
        (tag,
         " ".join([tokens[i][0] for i in rng]),
         ",".join([str(tokens[i][1]) for i in rng]),
         score)
        for rng, tag, score in entities_markup
    ]
    entity_doc = {}
    entity_doc["entity_all"] = []
    entity_doc["entity_location"] = []
    entity_doc["entity_organization"] = []
    entity_doc["entity_person"] = []
    entity_doc["entity_misc"] = []
    for tag, entity, rng, score in results:
        # Skip implausibly long extractions.
        if len(entity) > 30:
            continue
        entity_doc["entity_all"].append(entity)
        if tag == 'LOCATION' and score > 0.3:
            entity_doc["entity_location"].append(entity)
        elif tag == 'ORGANIZATION' and score > 0.5:
            entity_doc["entity_organization"].append(entity)
        elif tag == 'PERSON' and score > 0.3:
            entity_doc["entity_person"].append(entity)
        elif score > 0.5:
            entity_doc["entity_misc"].append(entity)
    return entity_doc