Example #1
def parse_document(document, information):
    print 'Acquiring extractor'
    extractor = named_entity_extractor(
        os.path.join(mitie_location, 'MITIE-models/english/ner_model.dat'))

    print 'Cleaning HTML'
    cleaned_document = clean_html(document)

    print 'Tokenizing'
    tokens = filter(None, tokenize(cleaned_document))

    print 'Extracting NER Entities'
    entities = extractor.extract_entities(tokens)

    print 'Done'
    normalized_entities = []

    for position, tag in entities:
        position_indices = list(position)
        # TODO: join how for other languages?
        # `position` lists the covered token indices, so the slice must
        # extend one past the last index or the final token is dropped
        words = ' '.join(tokens[position_indices[0]:position_indices[-1] + 1])
        if words:
            normalized_entities.append((tag, words))

    return normalized_entities
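
MITIE reports each entity position as a range of token indices, so joining by index avoids the slicing pitfall fixed above. A minimal usage sketch with a hypothetical document, assuming the same `extractor` and `tokenize` as in parse_document:

# hypothetical input; assumes the extractor loaded above
tokens = tokenize('The company MITIE Corp is based in Cambridge.')
for position, tag in extractor.extract_entities(tokens):
    # joining token-by-token sidesteps off-by-one slicing entirely
    print("{}: {}".format(tag, ' '.join(tokens[i] for i in position)))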
Example #2
 def tokenize_with_offsets(self, text):
     _text = text.encode('utf-8')
     offsets = []
     offset = 0
     tokens = [w.decode('utf-8') for w in tokenize(_text)]
     for tok in tokens:
         m = re.search(re.escape(tok), text[offset:], re.UNICODE)
         offsets.append(offset + m.start())
         offset += m.end()
     return tokens, offsets
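
The offsets returned here are character positions into the original string, so each token can be recovered by slicing. A quick sketch, assuming an instance of the tokenizer class above (instance name hypothetical):

tk = MITIETokenizer()
text = u"hello there bob"
tokens, offsets = tk.tokenize_with_offsets(text)
for tok, off in zip(tokens, offsets):
    # every offset points at the token's first character in `text`
    assert text[off:off + len(tok)] == tok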
Example #3
    def train(self, training_data, mitie_file, num_threads):
        # type: (TrainingData, str, Optional[int]) -> None
        from mitie import tokenize, text_categorizer_trainer

        trainer = text_categorizer_trainer(mitie_file)
        trainer.num_threads = num_threads
        for example in training_data.intent_examples:
            tokens = tokenize(example["text"])
            trainer.add_labeled_text(tokens, example["intent"])
        self.clf = trainer.train()
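
Once trained, the categorizer can score new utterances. A minimal sketch, under the assumption (as Rasa-style code elsewhere uses it) that MITIE's `text_categorizer` is callable on a token list and returns a `(label, confidence)` pair:

from mitie import tokenize

# `clf` is the text_categorizer produced by trainer.train() above
tokens = tokenize("book me a flight to Berlin")
label, confidence = clf(tokens)  # assumed to return a (label, score) pair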
Example #4
 def tokenize_with_offsets(self, text):
     _text = text.encode('utf-8')
     offsets = []
     offset = 0
     tokens = [w.decode('utf-8') for w in tokenize(_text)]
     for tok in tokens:
         m = re.search(re.escape(tok), text[offset:], re.UNICODE)
         offsets.append(offset + m.start())
         offset += m.end()
     return tokens, offsets
Example #5
    def features_for_sentences(self, sentences, feature_extractor):
        # type: (List[Text], mitie.total_word_feature_extractor) -> np.ndarray
        import mitie
        import numpy as np

        X = np.zeros((len(sentences), self.ndim(feature_extractor)))

        for idx, sentence in enumerate(sentences):
            tokens = mitie.tokenize(sentence)
            X[idx, :] = self.features_for_tokens(tokens, feature_extractor)
        return X
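
`features_for_tokens` and `ndim` are defined elsewhere in the class; in Rasa's MITIE featurizer they average MITIE's per-word embeddings and read the extractor's vector width. A sketch under those assumptions:

def ndim(self, feature_extractor):
    # width of the total_word_feature_extractor's vectors
    return feature_extractor.num_dimensions

def features_for_tokens(self, tokens, feature_extractor):
    import numpy as np
    # average the per-word feature vectors into one sentence vector
    vec = np.zeros(self.ndim(feature_extractor))
    for token in tokens:
        vec += feature_extractor.get_feature_vector(token)
    if tokens:
        vec /= len(tokens)
    return vec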
Example #7
def mitie_context(text, ner_model):
    """
    Send text to MITIE NER, format the results, and return them with the 3 words
    on either side of every extracted entity.

    The context words can be used to filter results (e.g., if it says "the
    province of Aleppo", look for an admin area rather than a city).
    This version does not produce any HTML marked-up text.

    Parameters
    ----------
    text: string
          The text to have its entities extracted
    ner_model: MITIE named entity extractor
               The NER model produced by `setup_mitie`

    Returns
    -------
    named_entities: dictionary
                    "entities" contains a list of dictionaries. Each of these
                    dicts has keys "tag", "text", and "score".
    """
    text = text.encode("utf-8")
    tokens = mitie.tokenize(text)
    # eventually, handle different NER models.
    entities = ner_model.extract_entities(tokens)
    out = []
    for e in entities:
        range = e[0]
        tag = e[1]
        score = e[2]
        entity_text = str(" ").join(tokens[i] for i in range)
        beg_token = min(range)
        end_token = max(range)

        context = []
        for i in [3, 2, 1]:
            # bounds checks instead of bare try/except: a negative index
            # would silently wrap around to the end of the token list
            if beg_token - i >= 0:
                context.append(tokens[beg_token - i])
            if end_token + i < len(tokens):
                context.append(tokens[end_token + i])

        out.append({
            u'tag': unicode(tag),
            u'text': entity_text,
            u'score': score,
            u'context': context
        })
    return {"entities": out}
Example #8
def _nlp_extract_metadata_core(text=None):

    if text:
        global ner
        from mitie import tokenize

        tokens = tokenize(text)
        entities = ner.extract_entities(tokens)
        locations = []
        organizations = []
        for e in entities:
            range = e[0]
            tag = e[1]
            score = e[2]
            # score_text = "{:0.3f}".format(score)
            entity_text = " ".join(tokens[i] for i in range)
            if tag == "LOCATION":
                locations.append((entity_text, score))
            elif tag == "ORGANIZATION":
                organizations.append((entity_text, score))
            # print tag+" : "+entity_text+" : "+score_text

        # Remove Duplicates
        locations = removeDuplicateEntities(locations)
        organizations = removeDuplicateEntities(organizations)

        # Resolve Locations to Regions
        regions = []
        for location in locations:
            location_text, score = location
            if score > settings.NLP_LOCATION_THRESHOLD:
                try:
                    region = Region.objects.get(name__iexact=location_text)
                    regions.append(region)
                except (Region.DoesNotExist, Region.MultipleObjectsReturned):
                    pass

        # Resolve organizations to Keywords/Tags
        keywords = []
        for organization in organizations:
            organization_text, score = organization
            try:
                keyword = Tag.objects.get(name__iexact=organization_text)
                keywords.append(keyword.name)
            except (Tag.DoesNotExist, Tag.MultipleObjectsReturned):
                pass

        return {'regions': regions, 'keywords': keywords}

    else:
        return None
Example #10
def find_entity(ent, text):
    tk = MITIETokenizer()
    tokens, offsets = tk.tokenize_with_offsets(text)
    if ent["start"] not in offsets:
        message = u"invalid entity {0} in example {1}:".format(ent, text) + \
                  u" entities must span whole tokens"
        raise ValueError(message)
    start = offsets.index(ent["start"])
    _slice = text[ent["start"]:ent["end"]]
    val_tokens = tokenize(_slice)
    end = start + len(val_tokens)
    return start, end
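
The entity dict format here matches what the trainer in the next example passes in: character `start`/`end` offsets plus a label. A worked sketch (hypothetical values, offsets counted by hand):

text = u"show me hotels in Berlin"
ent = {"start": 18, "end": 24, "entity": "location"}  # spans the token "Berlin"
start, end = find_entity(ent, text)  # -> (4, 5): half-open token index range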
Example #11
def train_entity_extractor(entity_examples, fe_file, max_num_threads):
    trainer = ner_trainer(fe_file)
    trainer.num_threads = max_num_threads
    for example in entity_examples:
        text = example["text"]
        tokens = tokenize(text)
        sample = ner_training_instance(tokens)
        for ent in example["entities"]:
            start, end = find_entity(ent, text)
            sample.add_entity(xrange(start, end), ent["entity"])

        trainer.add(sample)
    return trainer.train()
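
`trainer.train()` returns a full `named_entity_extractor`, so the result can be persisted with MITIE's own serialization and reloaded later (file name hypothetical):

import mitie

ner = train_entity_extractor(entity_examples, fe_file, max_num_threads)
ner.save_to_disk("trained_ner_model.dat")  # MITIE's native model format
reloaded = mitie.named_entity_extractor("trained_ner_model.dat")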
Example #12
    def train(self, training_data, mitie_file, num_threads):
        # type: (TrainingData, Text, Optional[int]) -> None
        import mitie

        trainer = mitie.text_categorizer_trainer(mitie_file)
        trainer.num_threads = num_threads
        for example in training_data.intent_examples:
            tokens = mitie.tokenize(example["text"])
            trainer.add_labeled_text(tokens, example["intent"])

        if training_data.intent_examples:
            # we can not call train if there are no examples!
            self.clf = trainer.train()
Example #13
    def find_entity(ent, text):
        import mitie

        tk = MitieTokenizer()
        tokens, offsets = tk.tokenize_with_offsets(text)
        if ent["start"] not in offsets:
            message = "Invalid entity {} in example '{}': entities must span whole tokens".format(
                ent, text)
            raise ValueError(message)
        start = offsets.index(ent["start"])
        _slice = text[ent["start"]:ent["end"]]
        val_tokens = mitie.tokenize(_slice)
        end = start + len(val_tokens)
        return start, end
Example #14
 def tokenize_with_offsets(self, text):
     _text = text.encode('utf-8')
     offsets = []
     offset = 0
     tokens = [w.decode('utf-8') for w in tokenize(_text)]
     for tok in tokens:
         m = re.search(re.escape(tok), text[offset:], re.UNICODE)
         if m is None:
             message = "Invalid MITIE offset. Token '{}' in message '{}'.".format(str(tok),
                                                                                  str(text.encode('utf-8')))
             raise ValueError(message)
         offsets.append(offset + m.start())
         offset += m.end()
     return tokens, offsets
Example #15
def mitie_extract_entities(text):
  if isinstance(text, unicode):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')  # MITIE doesn't like unicode and can't set encoding yet
  entities = []
  tokens = mitie.tokenize(text)
  mitie_entities = mt.extract_entities(tokens)
  for e in mitie_entities:
      range = e[0]
      tag = e[1]
      entity_text = " ".join(tokens[i] for i in range)
      entity_record = {u'entity': entity_text, u'type': tag}
      if entity_record not in mitie_known_bad_entities():
        entities.append(entity_record)
  return entities
Example #17
def get_entities(text, count):
    ner = named_entity_extractor(parent + '/../../lib/MITIE/MITIE-models/english/ner_model.dat')
    # Load a text file and convert it into a list of words.
    tokens = tokenize(text)

    entities = ner.extract_entities(tokens)
    entity_count = collections.Counter()

    for e in entities:
        range = e[0]
        entity_text = " ".join(tokens[i].decode() for i in range)
        entity_count[entity_text] += 1
    return entity_count.most_common(count)
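
Called with raw text and a count, it returns `(entity, frequency)` pairs, most frequent first. A sketch with a hypothetical input file:

with open('article.txt') as f:  # hypothetical input document
    for entity_text, count in get_entities(f.read(), 5):
        print("{}: {}".format(entity_text, count))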
Example #18
    def tokenize_with_offsets(self, text):
        # type: (Text) -> Tuple[List[Text], List[int]]
        from mitie import tokenize

        _text = text.encode('utf-8')
        offsets = []
        offset = 0
        tokens = [w.decode('utf-8') for w in tokenize(_text)]
        for tok in tokens:
            m = re.search(re.escape(tok), text[offset:], re.UNICODE)
            if m is None:
                message = "Invalid MITIE offset. Token '{}' in message '{}'.".format(str(tok),
                                                                                     str(text.encode('utf-8')))
                raise ValueError(message)
            offsets.append(offset + m.start())
            offset += m.end()
        return tokens, offsets
Example #19
def talk_to_mitie(text, ner_model):
    """
    Send text to MITIE NER, format the results, and return them.

    Note: this code also creates an HTML version of the output with
    named entities highlighted. That output is not used in Mordecai.

    Returns
    -------
    named_entities: dictionary
                    "entities" contains a list of dictionaries. Each of these
                    dicts has keys "tag", "text", and "score".
    """
    text = text.encode("utf-8")
    tokens = mitie.tokenize(text)
    tokens.append(' x ')  # padding token; removed again before the HTML join
    # eventually, handle different NER models here.
    entities = ner_model.extract_entities(tokens)
    out = []
    for e in entities:
        range = e[0]
        tag = e[1]
        score = e[2]
        entity_text = str(" ").join(tokens[i] for i in range)
        out.append({
            u'tag': unicode(tag),
            u'text': entity_text,
            u'score': score
        })
    for e in reversed(entities):
        range = e[0]
        tag = e[1]
        newt = tokens[range[0]]
        if len(range) > 1:
            for i in range:
                if i != range[0]:
                    newt += str(' ') + tokens[i]
        newt = (str('<span class="mitie-') + tag + str('">') + newt +
                str('</span>'))
        tokens = tokens[:range[0]] + [newt] + tokens[(range[-1] + 1):]
    del tokens[-1]
    html = str(' ').join(tokens)
    htmlu = unicode(html.decode("utf-8"))
    named_entities = {"entities": out, "html": htmlu}
    return named_entities
Example #21
    def train(self, training_data, mitie_file, num_threads):
        # type: (TrainingData, str, Optional[int]) -> None
        from mitie import ner_training_instance, ner_trainer, tokenize

        trainer = ner_trainer(mitie_file)
        trainer.num_threads = num_threads
        found_one_entity = False
        for example in training_data.entity_examples:
            text = example["text"]
            tokens = tokenize(text)
            sample = ner_training_instance(tokens)
            for ent in example["entities"]:
                start, end = MitieEntityExtractor.find_entity(ent, text)
                sample.add_entity(list(range(start, end)), ent["entity"])
                found_one_entity = True

            trainer.add(sample)
        # Mitie will fail to train if there is not a single entity tagged
        if found_one_entity:
            self.ner = trainer.train()
Example #22
def get_mitie_entities(original_tweets):
    system_entities = set()
    original_index = 0
    previous_token_end = 0
    ner = mitie.named_entity_extractor(
        'MITIE/MITIE-models/english/ner_model.dat')

    original_tweets_clean = original_tweets.replace(u"’", "'")
    for tweet in original_tweets_clean.split("\n"):
        entity_start = None

        stripped_tweet = tweet.strip()
        if not stripped_tweet:
            continue

        tokens = mitie.tokenize(stripped_tweet)
        entities = ner.extract_entities(tokens)
        if entities:
            current_entity = entities.pop(0)
        else:
            current_entity = None
        for i, token in enumerate(tokens):
            unicode_token = token.decode('utf-8')
            original_index = original_tweets_clean.index(
                unicode_token, previous_token_end)
            if entity_start is not None and i == current_entity[0][-1] + 1:
                system_entities.add(
                    (entity_start, previous_token_end, current_entity[1]))
                entity_start = None
                if entities:
                    current_entity = entities.pop(0)
                else:
                    current_entity = None
            if current_entity is not None and i == current_entity[0][0]:
                entity_start = original_index
            previous_token_end = original_index + len(unicode_token)
        if entity_start is not None:
            system_entities.add(
                (entity_start, previous_token_end, current_entity[1]))
    return system_entities
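
Each element of the returned set is a character span plus tag into the cleaned tweet text, which makes it directly comparable against gold annotations. A usage sketch with hypothetical input:

tweets = u"Obama visited Berlin\nMerkel spoke in Paris"
for start, end, tag in get_mitie_entities(tweets):
    print("{} [{}:{}] {}".format(tag, start, end, tweets[start:end]))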
Example #24
    def parse(self, text):
        tokens = tokenize(text)
        intent = self.get_intent(tokens)
        entities = self.get_entities(tokens)

        return {'intent': intent, 'entities': entities}
Example #25
 def tokenize(self, text):
     return [w.decode('utf-8') for w in tokenize(text.encode('utf-8'))]
Example #26
    def tokenize(self, text):
        # type: (Text) -> List[Text]
        from mitie import tokenize

        return [w.decode('utf-8') for w in tokenize(text.encode('utf-8'))]
Example #27
 def tokenize(self, text):
     return tokenize(text)
Example #28
def test(text):
    tokens = tokenize(text)
    res = mitie_extract_ner_series(tokens)
    return res
Example #29
    def tokenize(self, text):
        # type: (str) -> [str]
        from mitie import tokenize

        return [w.decode('utf-8') for w in tokenize(text.encode('utf-8'))]
Example #31
def process(text, model):
    tokens = mitie.tokenize(text)
    entities = model.extract_entities(tokens)
    return tokens, entities
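
The returned `entities` list pairs each token-index range with its tag (plus a confidence score in newer MITIE builds), so callers typically post-process like this:

tokens, entities = process(u"Angela Merkel visited Paris.", model)
for e in entities:
    token_range, tag = e[0], e[1]  # e[2] is the score when present
    print("{}: {}".format(tag, " ".join(tokens[i] for i in token_range)))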
Example #32
# This example shows how to use the MITIE Python API to print
# named entity recognition outputs.
#
import sys, os
# Make sure you put the mitielib folder into the python search path.  There are
# a lot of ways to do this, here we do it programmatically with the following
# two statements:
parent = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parent + '/lib/MITIE/mitielib')

print parent

import mitie

print "loading NER model..."
ner = mitie.named_entity_extractor('lib/MITIE/english/ner_model.dat')
print "\nTags output by this NER model:", ner.get_possible_ner_tags()

# Load a text file and convert it into a list of words.
tokens = mitie.tokenize(mitie.load_entire_file('lib/MITIE/sample_text.txt'))
print "Tokenized input:", tokens

entities = ner.extract_entities(tokens)
print "\nEntities found:", entities
print "\nNumber of entities detected:", len(entities)

for e in entities:
    range = e[0]
    tag = e[1]
    entity_text = " ".join(tokens[i] for i in range)
    print "    " + tag + ": " + entity_text