Ejemplo n.º 1
0
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
        **kwargs: Any,
    ) -> MitieEntityExtractor:
        """Restore the extractor from model storage.

        Looks for ``cls.MITIE_RESOURCE_FILE`` inside the directory yielded by
        ``model_storage.read_from(resource)`` and, when present, builds the
        component around the MITIE extractor loaded from that file.

        If the file is missing, or a ``ValueError`` is raised while reading,
        a debug message is logged and a component without a loaded extractor
        is returned instead.
        """
        import mitie

        try:
            with model_storage.read_from(resource) as storage_dir:
                extractor_path = storage_dir / cls.MITIE_RESOURCE_FILE
                if extractor_path.exists():
                    return cls(
                        config,
                        model_storage,
                        resource,
                        ner=mitie.named_entity_extractor(str(extractor_path)),
                    )
                raise FileNotFoundError(
                    f"Expected a MITIE extractor file at {extractor_path}."
                )
        except (FileNotFoundError, ValueError) as error:
            logger.debug(
                f"Failed to load {cls.__name__} from model storage. "
                f"This can happen if the model could not be trained because regexes "
                f"could not be extracted from the given training data - and hence "
                f"could not be persisted. Error: {error}."
            )
            return cls(config, model_storage, resource)
Ejemplo n.º 2
0
def parse_document(document, information):
    """Extract named entities from an HTML document with MITIE.

    The document is cleaned with ``clean_html``, tokenized with ``tokenize``
    (empty tokens are dropped) and tagged by the English NER model located
    under ``mitie_location``.

    :param document: raw HTML text to analyse.
    :param information: not used by this function.
    :return: list of ``(tag, words)`` tuples, where ``words`` is the entity's
        tokens joined by single spaces.
    """
    # Single-argument ``print(...)`` is valid under both Python 2 and 3.
    print('Acquiring extractor')
    extractor = named_entity_extractor(
        os.path.join(mitie_location, 'MITIE-models/english/ner_model.dat'))

    print('Cleaning HTML')
    cleaned_document = clean_html(document)

    print('Tokenizing')
    # Build a real list: the tokens are indexed below, which a lazy
    # ``filter`` object does not support.
    tokens = [token for token in tokenize(cleaned_document) if token]

    print('Extracting NER Entities')
    entities = extractor.extract_entities(tokens)

    print('Done')
    normalizes_entities = []

    for entity in entities:
        # Index instead of unpacking so tuples that carry extra fields
        # (e.g. a trailing score) are accepted as well.
        position, tag = entity[0], entity[1]
        # TODO: join how for other languages?
        # Join every token of the entity. The previous slice
        # ``tokens[first:last]`` left out the final token, so single-token
        # entities produced an empty string and were silently skipped.
        words = ' '.join(tokens[index] for index in position)
        if words:
            normalizes_entities.append((tag, words))

    return normalizes_entities
 def __init__(self, intent_classifier_file=None, entity_extractor_file=None, feature_extractor_file=None, **kwargs):
     """Build the interpreter from persisted model files.

     :param intent_classifier_file: path of a cloudpickled intent classifier;
         when falsy, ``self.classifier`` stays ``None``.
     :param entity_extractor_file: path of a MITIE entity extractor model;
         when falsy, ``self.extractor`` stays ``None``.
     :param feature_extractor_file: value handed to ``MITIEFeaturizer``.
     :param kwargs: accepted and ignored.
     """
     # Always define both attributes so later reads never hit an
     # AttributeError when a model file was not supplied.
     self.extractor = None
     self.classifier = None
     if entity_extractor_file:
         self.extractor = named_entity_extractor(entity_extractor_file)
     if intent_classifier_file:
         # NOTE(review): unpickling can execute arbitrary code; only load
         # classifier files that come from a trusted source.
         with open(intent_classifier_file, 'rb') as f:
             self.classifier = cloudpickle.load(f)
     self.featurizer = MITIEFeaturizer(feature_extractor_file)
     self.tokenizer = MITIETokenizer()
Ejemplo n.º 4
0
    def load(self):
        """Load the English MITIE NER model from the user's home directory.

        The model file is expected at
        ``~/.verbis/models/mitie/english/ner_model.dat``.

        :return: the object created by ``named_entity_extractor`` for that file.
        """
        model_path = os.path.join(
            os.path.expanduser('~'),
            '.verbis/models/mitie',
            'english',
            'ner_model.dat',
        )
        return named_entity_extractor(model_path)
Ejemplo n.º 5
0
    def load(meta, featurizer=None):
        """Assemble a ``MITIESklearnInterpreter`` from model metadata.

        Every part whose path on ``meta`` is falsy is passed on as ``None``;
        a featurizer is created from ``meta.feature_extractor_path`` only
        when the caller did not supply one.

        :type meta: rasa_nlu.model.Metadata
        :rtype: MITIESklearnInterpreter
        """
        extractor = (
            named_entity_extractor(meta.entity_extractor_path)
            if meta.entity_extractor_path
            else None
        )

        if featurizer is None:
            featurizer = MITIEFeaturizer(meta.feature_extractor_path)

        classifier = None
        if meta.intent_classifier_path:
            with open(meta.intent_classifier_path, 'rb') as handle:
                classifier = cloudpickle.load(handle)

        entity_synonyms = (
            Interpreter.load_synonyms(meta.entity_synonyms_path)
            if meta.entity_synonyms_path
            else None
        )

        return MITIESklearnInterpreter(
            classifier, extractor, featurizer, entity_synonyms)
Ejemplo n.º 6
0
def setup_mitie(mitie_directory, mitie_ner_model):
    """Make MITIE importable and load a NER model.

    ``mitie_directory`` is appended to ``sys.path``; the model file
    ``mitie_ner_model`` is then loaded and the resulting
    ``named_entity_extractor`` object is returned.
    """
    sys.path.append(mitie_directory)
    return mitie.named_entity_extractor(mitie_ner_model)
Ejemplo n.º 7
0
    def __init__(self, model="test_data/edr_ner_model_gigaword_embeddings.dat"):
        """Create the parser and load its MITIE NER model.

        :param model: filepath of the MITIE model to load
        :type model: str
        """
        extractor = named_entity_extractor(model)
        self.ner = extractor
        super(MITIEBasedParser, self).__init__()
Ejemplo n.º 8
0
    def load(cls, model_dir, model_metadata, cached_component, **kwargs):
        # type: (Text, Metadata, Optional[MitieEntityExtractor], **Any) -> MitieEntityExtractor
        """Create the extractor, loading its MITIE model when one is recorded.

        A model is loaded only if ``model_dir`` is truthy and the metadata
        holds a truthy ``"entity_extractor_mitie"`` entry; otherwise an
        extractor without a model is returned.
        """
        import mitie

        metadata_key = "entity_extractor_mitie"
        if not (model_dir and model_metadata.get(metadata_key)):
            return MitieEntityExtractor()

        model_path = os.path.join(model_dir, model_metadata.get(metadata_key))
        return MitieEntityExtractor(mitie.named_entity_extractor(model_path))
Ejemplo n.º 9
0
    def load(cls, model_dir, model_metadata, cached_component, **kwargs):
        # type: (Text, Metadata, Optional[MitieEntityExtractor], **Any) -> MitieEntityExtractor
        """Build a ``MitieEntityExtractor`` from a persisted model directory.

        Falls back to an extractor without a model when ``model_dir`` is
        falsy or the metadata has no truthy ``"entity_extractor_mitie"``
        entry.
        """
        import mitie

        has_model = model_dir and model_metadata.get("entity_extractor_mitie")
        if not has_model:
            return MitieEntityExtractor()

        relative_name = model_metadata.get("entity_extractor_mitie")
        loaded = mitie.named_entity_extractor(
            os.path.join(model_dir, relative_name))
        return MitieEntityExtractor(loaded)
Ejemplo n.º 10
0
    def load(cls, model_dir, entity_extractor):
        # type: (str, str) -> MitieEntityExtractor
        """Create the extractor, loading ``entity_extractor`` from ``model_dir``.

        When either argument is falsy, an extractor without a model is
        returned.
        """
        from mitie import named_entity_extractor

        if not (model_dir and entity_extractor):
            return MitieEntityExtractor()

        model_path = os.path.join(model_dir, entity_extractor)
        return MitieEntityExtractor(named_entity_extractor(model_path))
Ejemplo n.º 11
0
def _load_ner():
    """Lazily initialise the module-level ``ner`` MITIE extractor.

    Does nothing unless ``settings.NLP_ENABLED`` is truthy. Otherwise
    ``settings.NLP_LIBRARY_PATH`` is added to ``sys.path`` (once) and, if
    ``ner`` is still falsy, the model at ``settings.NLP_MODEL_PATH`` is
    loaded into it.

    Loading is best effort: any ordinary error is reported on stdout and
    otherwise ignored, leaving ``ner`` unchanged.
    """
    global ner
    if not settings.NLP_ENABLED:
        return
    try:
        if settings.NLP_LIBRARY_PATH not in sys.path:
            sys.path.append(settings.NLP_LIBRARY_PATH)
        if not ner:
            from mitie import named_entity_extractor
            ner = named_entity_extractor(settings.NLP_MODEL_PATH)
    except Exception:
        # ``Exception`` rather than ``BaseException``: KeyboardInterrupt and
        # SystemExit must not be swallowed by a best-effort model load.
        print("Could not load the NLP NER")
Ejemplo n.º 12
0
def _load_ner():
    """Lazily initialise the module-level ``ner`` MITIE extractor.

    Does nothing unless ``settings.NLP_ENABLED`` is truthy. Otherwise
    ``settings.NLP_LIBRARY_PATH`` is added to ``sys.path`` (once) and, if
    ``ner`` is still falsy, the model at ``settings.NLP_MODEL_PATH`` is
    loaded into it.

    Loading is best effort: any ordinary error is reported on stdout and
    otherwise ignored, leaving ``ner`` unchanged.
    """
    global ner
    if not settings.NLP_ENABLED:
        return
    try:
        if settings.NLP_LIBRARY_PATH not in sys.path:
            sys.path.append(settings.NLP_LIBRARY_PATH)
        if not ner:
            from mitie import named_entity_extractor
            ner = named_entity_extractor(settings.NLP_MODEL_PATH)
    except Exception:
        # A bare ``except:`` would also trap KeyboardInterrupt and
        # SystemExit; only ordinary errors are meant to be ignored here.
        print("Could not load the NLP NER")
    def load(cls, model_dir, entity_extractor_mitie):
        # type: (Text, Text) -> MitieEntityExtractor
        """Create the extractor, loading ``entity_extractor_mitie`` from ``model_dir``.

        When either argument is falsy, an extractor without a model is
        returned.
        """
        import mitie

        if not (model_dir and entity_extractor_mitie):
            return MitieEntityExtractor()

        model_path = os.path.join(model_dir, entity_extractor_mitie)
        return MitieEntityExtractor(mitie.named_entity_extractor(model_path))
Ejemplo n.º 14
0
def test_mitie_context():
    """Check ``mitie_context`` against a known result for a short sentence.

    MITIE's location and model path are read from the ``Locations`` section
    of ``../../config.ini`` (relative to this file).
    """
    here = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    config_paths = glob.glob(os.path.join(here, '../../config.ini'))

    config = ConfigParser()
    config.read(config_paths)
    mitie_directory = config.get('Locations', 'mitie_directory')
    mitie_ner_model = config.get('Locations', 'mitie_ner_model')

    sys.path.append(mitie_directory)
    ner_model = named_entity_extractor(mitie_ner_model)

    expected = {
        u'entities': [{
            u'text': 'Ontario',
            u'tag': u'LOCATION',
            u'score': 1.3923831181343844,
            u'context': ['meeting', 'happened', 'in', '.'],
        }],
    }
    actual = mitie_context("The meeting happened in Ontario.", ner_model)
    assert actual == expected
Ejemplo n.º 15
0
def get_entities(text, count):
    """Return the ``count`` most frequent named entities found in ``text``.

    :param text: text to tokenize and tag with the English MITIE NER model.
    :param count: number of entries to return, passed to
        ``Counter.most_common``.
    :return: list of ``(entity_text, occurrences)`` pairs, most frequent
        first.
    """
    ner = named_entity_extractor(parent + '/../../lib/MITIE/MITIE-models/english/ner_model.dat')
    # Convert the text into a list of words.
    tokens = tokenize(text)

    # ``entity[0]`` holds the token indices of the entity; the tokens are
    # decoded before being joined into the entity text. Counter does the
    # tallying, and no local shadows the builtin ``range`` any more.
    entity_count = collections.Counter(
        " ".join(tokens[i].decode() for i in entity[0])
        for entity in ner.extract_entities(tokens)
    )
    return entity_count.most_common(count)
Ejemplo n.º 16
0
def main():
    """Load the MITIE model into the global ``NER`` and serve HTTP requests.

    Returns early, after logging the error, when the model cannot be
    loaded. Otherwise serves on ``(HOST, PORT)`` until interrupted from the
    keyboard, and always closes the server socket on the way out.
    """
    global NER

    try:
        log('Loading %s', MODEL)
        NER = mitie.named_entity_extractor(MODEL)
    except Exception as load_error:
        log('Can not load model: "%s"', load_error)
        return

    httpd = HTTPServer((HOST, PORT), HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        httpd.serve_forever()
    except KeyboardInterrupt:
        log('Quiting')
    finally:
        httpd.server_close()
Ejemplo n.º 17
0
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['MitieEntityExtractor'] = None,
             **kwargs: Any) -> 'MitieEntityExtractor':
        """Create the component, loading its MITIE model when available.

        The model file name is read from ``meta["file"]`` and resolved
        against ``model_dir``. If no name is recorded, or the file does not
        exist, the component is created from ``meta`` alone.
        """
        import mitie

        file_name = meta.get("file")
        if file_name:
            model_path = os.path.join(model_dir, file_name)
            if os.path.exists(model_path):
                return cls(meta, mitie.named_entity_extractor(model_path))

        # No file recorded, or the recorded file is missing on disk.
        return cls(meta)
 def __init__(self,
              intent_classifier=None,
              entity_extractor=None,
              feature_extractor=None,
              entity_synonyms=None,
              **kwargs):
     """Build the interpreter from persisted model files.

     Each of ``extractor``, ``classifier`` and ``ent_synonyms`` is loaded
     only when its argument is truthy and is ``None`` otherwise. The
     featurizer and tokenizer are always created.
     """
     self.extractor = (
         named_entity_extractor(entity_extractor, feature_extractor)
         if entity_extractor
         else None
     )

     self.classifier = None
     if intent_classifier:
         with open(intent_classifier, 'rb') as handle:
             self.classifier = cloudpickle.load(handle)

     self.featurizer = MITIEFeaturizer(feature_extractor)
     self.tokenizer = MITIETokenizer()

     self.ent_synonyms = (
         Interpreter.load_synonyms(entity_synonyms)
         if entity_synonyms
         else None
     )
    def load(cls,
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['MitieEntityExtractor'] = None,
             **kwargs: Any) -> 'MitieEntityExtractor':
        """Create the component, loading its MITIE model when available.

        The component's metadata is taken from
        ``model_metadata.for_component(cls.name)``. Its ``"classifier_file"``
        entry (default ``MITIE_ENTITY_MODEL_FILE_NAME``) is resolved against
        ``model_dir``. If the name is falsy or the file does not exist, the
        component is created from the metadata alone.
        """
        import mitie

        meta = model_metadata.for_component(cls.name)
        file_name = meta.get("classifier_file", MITIE_ENTITY_MODEL_FILE_NAME)

        if file_name:
            model_path = os.path.join(model_dir, file_name)
            if os.path.exists(model_path):
                return cls(meta, mitie.named_entity_extractor(model_path))

        # No file name, or the file is missing on disk.
        return cls(meta)
Ejemplo n.º 20
0
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['MitieEntityExtractor'] = None,
             **kwargs: Any
             ) -> 'MitieEntityExtractor':
        """Instantiate the component, with its MITIE model if one is on disk.

        ``meta["file"]`` names the model file inside ``model_dir``. When the
        entry is falsy or the file is absent, the component is built from
        ``meta`` only.
        """
        import mitie

        recorded_name = meta.get("file")
        if recorded_name:
            candidate = os.path.join(model_dir, recorded_name)
            if os.path.exists(candidate):
                loaded = mitie.named_entity_extractor(candidate)
                return cls(meta, loaded)

        return cls(meta)
Ejemplo n.º 21
0
def test_mitie_context():
    """Check ``mitie_context`` against a known result for a short sentence.

    MITIE's location and model path are read from the ``Locations`` section
    of ``../../config.ini`` (relative to this file).
    """
    base_dir = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    ini_files = glob.glob(os.path.join(base_dir, '../../config.ini'))

    settings_parser = ConfigParser()
    settings_parser.read(ini_files)
    mitie_directory = settings_parser.get('Locations', 'mitie_directory')
    mitie_ner_model = settings_parser.get('Locations', 'mitie_ner_model')

    sys.path.append(mitie_directory)
    ner_model = named_entity_extractor(mitie_ner_model)

    sentence = "The meeting happened in Ontario."
    observed = mitie_context(sentence, ner_model)
    gold_entity = {
        u'text': 'Ontario',
        u'tag': u'LOCATION',
        u'score': 1.3923831181343844,
        u'context': ['meeting', 'happened', 'in', '.'],
    }
    assert observed == {u'entities': [gold_entity]}
Ejemplo n.º 22
0
def get_mitie_entities(original_tweets):
    """Locate MITIE named entities in a newline-separated block of text.

    Each non-blank line is tokenized and tagged on its own. Every token is
    then searched for in the full, apostrophe-normalized text, starting at
    the end of the previous token, to recover character offsets.

    :param original_tweets: text with one tweet per line.
    :return: set of ``(start, end, label)`` tuples. ``start`` and ``end``
        are character offsets into the text after U+2019 has been replaced
        by ``'``; ``end`` is the end of the entity's last token. ``label``
        is the second element of the MITIE entity tuple (presumably the tag
        name -- confirm against the MITIE version in use).
    """
    system_entities = set()
    original_index = 0
    # Search cursor into the full text; it only ever moves forward and is
    # shared across lines.
    previous_token_end = 0
    ner = mitie.named_entity_extractor(
        'MITIE/MITIE-models/english/ner_model.dat')

    # Normalize typographic apostrophes; all offsets refer to this string.
    original_tweets_clean = original_tweets.replace(u"’", "'")
    for tweet in original_tweets_clean.split("\n"):
        # Offset where the currently open entity starts, or None.
        entity_start = None

        stripped_tweet = tweet.strip()
        if not stripped_tweet:
            continue

        tokens = mitie.tokenize(stripped_tweet)
        entities = ner.extract_entities(tokens)
        # Entities are consumed front to back, one "current" entity at a
        # time, while walking the tokens.
        if entities:
            current_entity = entities.pop(0)
        else:
            current_entity = None
        for i, token in enumerate(tokens):
            unicode_token = token.decode('utf-8')
            # str.index raises ValueError if the decoded token cannot be
            # found at or after the cursor.
            original_index = original_tweets_clean.index(
                unicode_token, previous_token_end)
            # First token past the open entity: emit it (the cursor still
            # points at the end of the entity's last token) and advance to
            # the next entity, if any.
            if entity_start is not None and i == current_entity[0][-1] + 1:
                system_entities.add(
                    (entity_start, previous_token_end, current_entity[1]))
                entity_start = None
                if entities:
                    current_entity = entities.pop(0)
                else:
                    current_entity = None
            # This token is the first token of the current entity.
            if current_entity is not None and i == current_entity[0][0]:
                entity_start = original_index
            previous_token_end = original_index + len(unicode_token)
        # An entity that runs to the end of the line is still open here.
        if entity_start is not None:
            system_entities.add(
                (entity_start, previous_token_end, current_entity[1]))
    return system_entities
Ejemplo n.º 23
0
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Text,
        model_metadata: Metadata = None,
        cached_component: Optional["MitieEntityExtractor"] = None,
        **kwargs: Any,
    ) -> "MitieEntityExtractor":
        """Create the component, loading its MITIE model when available.

        The model file name comes from ``meta["file"]`` and is resolved
        against ``model_dir``. Without a name, or when the file does not
        exist, the component is created from ``meta`` alone.
        """
        import mitie

        file_name = meta.get("file")
        if file_name:
            model_path = os.path.join(model_dir, file_name)
            if os.path.exists(model_path):
                return cls(meta, mitie.named_entity_extractor(model_path))

        # No file recorded, or the recorded file is missing on disk.
        return cls(meta)
Ejemplo n.º 24
0
    def load(cls,
             model_dir=None,  # type: Text
             model_metadata=None,  # type: Metadata
             cached_component=None,  # type: Optional[MitieEntityExtractor]
             **kwargs  # type: **Any
             ):
        # type: (...) -> MitieEntityExtractor
        """Create the component, loading its MITIE model when available.

        The component's metadata is taken from
        ``model_metadata.for_component(cls.name)``. Its ``"classifier_file"``
        entry (default ``MITIE_ENTITY_MODEL_FILE_NAME``) is resolved against
        ``model_dir``. If the name is falsy or the file does not exist, the
        component is created from the metadata alone.
        """
        import mitie

        meta = model_metadata.for_component(cls.name)
        file_name = meta.get("classifier_file", MITIE_ENTITY_MODEL_FILE_NAME)

        if file_name:
            model_path = os.path.join(model_dir, file_name)
            if os.path.exists(model_path):
                return cls(meta, mitie.named_entity_extractor(model_path))

        # No file name, or the file is missing on disk.
        return cls(meta)
Ejemplo n.º 25
0
def parse_document(document, information):
    """Extract named entities from an HTML document with MITIE.

    The document is cleaned with ``clean_html``, tokenized with ``tokenize``
    (empty tokens are dropped) and tagged by the English NER model located
    under ``mitie_location``.

    :param document: raw HTML text to analyse.
    :param information: not used by this function.
    :return: list of ``(tag, words)`` tuples, where ``words`` is the entity's
        tokens joined by single spaces.
    """
    # Single-argument ``print(...)`` is valid under both Python 2 and 3.
    print('Acquiring extractor')
    extractor = named_entity_extractor(os.path.join(mitie_location, 'MITIE-models/english/ner_model.dat'))

    print('Cleaning HTML')
    cleaned_document = clean_html(document)

    print('Tokenizing')
    # Build a real list: the tokens are indexed below, which a lazy
    # ``filter`` object does not support.
    tokens = [token for token in tokenize(cleaned_document) if token]

    print('Extracting NER Entities')
    entities = extractor.extract_entities(tokens)

    print('Done')
    normalizes_entities = []

    for entity in entities:
        # Index instead of unpacking so tuples that carry extra fields
        # (e.g. a trailing score) are accepted as well.
        position, tag = entity[0], entity[1]
        # TODO: join how for other languages?
        # Join every token of the entity. The previous slice
        # ``tokens[first:last]`` left out the final token, so single-token
        # entities produced an empty string and were silently skipped.
        words = ' '.join(tokens[index] for index in position)
        if words:
            normalizes_entities.append((tag, words))

    return normalizes_entities
    def load_pretrained_model(self,
                              modelname='pretrained-StanfordNER',
                              numclass=3):
        """ Loads a pre-trained model.

        Records the choice in ``self.pretrained_model``, sets
        ``self.transfer_method`` to ``'none'``, stores the tagger in
        ``self.NER`` (``None`` for spaCy) and fills
        ``self.model['entity_types']`` and ``self.model['training_corpus']``.

        Parameters
        ----------
        modelname : str
            The name of the pre-trained model to use. The options are:
                * 'pretrained-StanfordNER': Used a CRF and word embeddings.
                    See: https://nlp.stanford.edu/software/CRF-NER.shtml
                * 'pretrained-MITIE': Used Structural SVMs and word embeddings.
                    Uses Dhillon et al's "eigenwords" word embeddings.
                    See: https://github.com/mit-nlp/MITIE
                * 'pretrained-SENNA': Used multilayer perceptrons and the
                    50-dimensional CW (2008) word embeddings.
                    See: http://ml.nec-labs.com/senna/
                * 'pretrained-spacy': Used BILOU scheme; the algorithm is "a
                    pastiche of well-known methods...a greedy transition-based
                    parser guided by a linear model whose weights are learned
                    using the averaged perceptron loss, via the dynamic oracle
                    imitation strategy". See:
                    https://spacy.io/docs/usage/entity-recognition.
                    Using pre-trained model 'en_core_web_sm' here.
                    NOTE: could try 'en_depent_web_md' instead.
        numclass : int
            The number of classes for the pre-trained classifier; this is
            relevant only when modelname is 'pretrained-StanfordNER'.

        Raises
        ------
        ValueError
            If ``modelname`` is not one of the names above, or if
            ``numclass`` is not 3, 4 or 7 for 'pretrained-StanfordNER'.

        """
        self.pretrained_model = modelname
        self.transfer_method = 'none'
        if modelname == 'pretrained-StanfordNER':
            if numclass == 3:
                self.NER = StanfordNERTagger(
                    STANFORD_MODEL_DIR +
                    'english.all.3class.distsim.crf.ser.gz')
                self.model['entity_types'] = ['LOC', 'ORG', 'PER']
                self.model['training_corpus'] = [
                    'CONLL03 eng.train', 'MUC6 train', 'MUC7 train', 'ACE2002',
                    'in-house data'
                ]
            elif numclass == 4:
                self.NER = StanfordNERTagger(
                    STANFORD_MODEL_DIR +
                    'english.conll.4class.distsim.crf.ser.gz')
                self.model['entity_types'] = ['LOC', 'PER', 'ORG', 'MISC']
                self.model['training_corpus'] = ['CONLL03 eng.train']
            elif numclass == 7:
                self.NER = StanfordNERTagger(
                    STANFORD_MODEL_DIR +
                    'english.muc.7class.distsim.crf.ser.gz')
                self.model['entity_types'] = [
                    'LOC',
                    'ORG',
                    'PER',
                    'MISC',
                    'MON',  # MONEY
                    'PCT',  # PERCENT
                    'DAT',  # DATE
                    'TIM',  # TIME
                ]
                self.model['training_corpus'] = ['MUC6 train', 'MUC7 train']
            else:
                raise ValueError(
                    'When using StanfordNER, numclass must be 3, 4 or 7.')

        elif modelname == 'pretrained-MITIE':
            self.NER = mitie.named_entity_extractor(MITIE_MODEL_DIR)
            self.model['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']
            self.model['training_corpus'] = ['?']

        elif modelname == 'pretrained-SENNA':
            self.NER = SennaNERTagger(SENNA_DIR)
            self.model['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']
            self.model['training_corpus'] = ["?"]

        elif modelname == 'pretrained-spacy':
            self.NER = None
            self.model['entity_types'] = [
                'PER',  # PERSON
                'NOR',  # NORP
                'FAC',  # FACILITY
                'ORG',  # ORGANIZATION
                'GPE',  # GEO-POLITICAL
                'LOC',  # LOCATION
                'PRO',  # PRODUCT
                'EVE',  # EVENT
                'WOR',  # WORK OF ART
                'LAN',  # LANGUAGE
                'DAT',  # DATE
                'TIM',  # TIME
                'PCT',  # PERCENT
                'MON',  # MONEY
                'QUA',  # QUANTITY
                'ORD',  # ORDINAL
                'CAR',  # CARDINAL
            ]
            self.model['training_corpus'] = ["?"]
        else:
            # Adjacent literals instead of backslash continuations inside one
            # literal, which embedded the source indentation in the message.
            raise ValueError(
                "Wrong modelname; must be 'pretrained-spacy', "
                "'pretrained-SENNA', 'pretrained-MITIE', "
                "or 'pretrained-StanfordNER'.")
Ejemplo n.º 27
0
 def __init__(self):
     """Load the MITIE model at ``NamedEntityExtractor.NE_DATA`` and bind the tokenizer."""
     model_path = NamedEntityExtractor.NE_DATA
     self.extractor = named_entity_extractor(model_path)
     self.tokenizer = tokenize
def extract_entities(doc_iter, extract_field='body', extracted_lang_field='body_lang', extracted_translated_field="body_translated"):
    """Annotate documents with MITIE named entities and yield them.

    For every document that has ``extract_field``, an ``"entities"`` dict is
    attached containing the entities of that field, the language used
    (``"original_lang"``) and, for non-English documents, the entities of
    ``extracted_translated_field`` tagged with the English model. Documents
    without ``extract_field`` are yielded unchanged.

    :param doc_iter: iterable of dict-like documents; each must have ``"id"``.
    :param extract_field: key of the text to analyse.
    :param extracted_lang_field: key holding the language code (``'en'`` is
        used when it is missing or has no model).
    :param extracted_translated_field: key of the English translation.
    """
    sys.path.append(".")
    from mitie import tokenize_with_offsets, named_entity_extractor
    # Single-argument ``print(...)`` is valid under both Python 2 and 3.
    print("loading NER models...")

    ner_models = {}
    ner_models['en'] = named_entity_extractor('ner_model_english.dat')
    ner_models['es'] = named_entity_extractor('ner_model_spanish.dat')

    def entities(extracted_text, lang):
        """Return the entity lists found in ``extracted_text`` with the ``lang`` model."""
        # Blank out non-ASCII characters so the text can be ASCII-encoded.
        extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
        extracted_text = extracted_text.replace("[:newline:]", "           ")
        extracted_text = extracted_text.encode("ascii")
        tokens = tokenize_with_offsets(extracted_text)

        entity_doc = {
            "entity_all": [],
            "entity_location": [],
            "entity_organization": [],
            "entity_person": [],
            "entity_misc": [],
        }

        for rng, tag, score in ner_models[lang].extract_entities(tokens):
            # Each token is indexed as (text, offset); only the text is needed.
            entity = " ".join([tokens[i][0] for i in rng])
            if len(entity) > 30:
                continue

            entity_doc["entity_all"].append(entity)

            # Compare the numeric score. It used to be formatted into a
            # string first, so the thresholds below were never applied as
            # numeric comparisons.
            if tag == 'LOCATION' and score > 0.3:
                entity_doc["entity_location"].append(entity)
            elif tag == 'ORGANIZATION' and score > 0.5:
                entity_doc["entity_organization"].append(entity)
            elif tag == 'PERSON' and score > 0.3:
                entity_doc["entity_person"].append(entity)
            elif score > 0.5:
                entity_doc["entity_misc"].append(entity)
        return entity_doc

    for doc in doc_iter:
        # Unused below; kept so a document without "id" still raises
        # KeyError exactly as before.
        doc_id = doc["id"]
        if extract_field in doc:
            lang = doc.get(extracted_lang_field, 'en')

            # TODO Hack to ensure at least en is run
            lang = lang if lang in ner_models else 'en'

            mitie_entities = entities(doc[extract_field], lang)
            doc["entities"] = {extract_field + "_entities": mitie_entities}
            doc["entities"]["original_lang"] = lang
            doc["entities"][extract_field + "_entities_translated"] = {}

            # Now extract entities for any translated fields
            if not lang == 'en':
                mitie_entities = entities(doc[extracted_translated_field], 'en')
                doc["entities"][extract_field + "_entities_translated"] = mitie_entities

        # TODO do attachments here instead of in a seperate execution of this stage
        yield doc
Ejemplo n.º 29
0
 def load(self, file_name):
     """Load the MITIE NER model stored at ``file_name`` into ``self.model``."""
     extractor = named_entity_extractor(file_name)
     self.model = extractor
Ejemplo n.º 30
0
 def load(self):
     """Load the MITIE NER model at ``models/ner_model.dat`` into ``self.ner``."""
     model_path = 'models/ner_model.dat'
     self.ner = mitie.named_entity_extractor(model_path)
Ejemplo n.º 31
0
def extract_entities(doc_iter,
                     extract_field='body',
                     extracted_lang_field='body_lang',
                     extracted_translated_field="body_translated"):
    """Annotate documents with MITIE named entities and yield them.

    For every document that has ``extract_field``, an ``"entities"`` dict is
    attached containing the entities of that field, the language used
    (``"original_lang"``) and, for non-English documents, the entities of
    ``extracted_translated_field`` tagged with the English model. Documents
    without ``extract_field`` are yielded unchanged.

    :param doc_iter: iterable of dict-like documents; each must have ``"id"``.
    :param extract_field: key of the text to analyse.
    :param extracted_lang_field: key holding the language code (``'en'`` is
        used when it is missing or has no model).
    :param extracted_translated_field: key of the English translation.
    """
    sys.path.append(".")
    from mitie import tokenize_with_offsets, named_entity_extractor
    # Single-argument ``print(...)`` is valid under both Python 2 and 3.
    print("loading NER models...")

    ner_models = {}
    ner_models['en'] = named_entity_extractor(
        '/usr/src/app/static-data/ner_model_english.dat')
    ner_models['es'] = named_entity_extractor(
        '/usr/src/app/static-data/ner_model_spanish.dat')

    def entities(extracted_text, lang):
        """Return the entity lists found in ``extracted_text`` with the ``lang`` model."""
        # Blank out non-ASCII characters so the text can be ASCII-encoded.
        extracted_text = re.sub(r'[^\x00-\x7F]', ' ', extracted_text)
        extracted_text = extracted_text.replace("[:newline:]", "           ")
        extracted_text = extracted_text.encode("ascii")
        tokens = tokenize_with_offsets(extracted_text)

        entity_doc = {
            "entity_all": [],
            "entity_location": [],
            "entity_organization": [],
            "entity_person": [],
            "entity_misc": [],
        }

        for rng, tag, score in ner_models[lang].extract_entities(tokens):
            # Each token is indexed as (text, offset); only the text is needed.
            entity = " ".join([tokens[i][0] for i in rng])
            if len(entity) > 30:
                continue

            entity_doc["entity_all"].append(entity)

            # Compare the numeric score. It used to be formatted into a
            # string first, so the thresholds below were never applied as
            # numeric comparisons.
            if tag == 'LOCATION' and score > 0.3:
                entity_doc["entity_location"].append(entity)
            elif tag == 'ORGANIZATION' and score > 0.5:
                entity_doc["entity_organization"].append(entity)
            elif tag == 'PERSON' and score > 0.3:
                entity_doc["entity_person"].append(entity)
            elif score > 0.5:
                entity_doc["entity_misc"].append(entity)
        return entity_doc

    for doc in doc_iter:
        # Unused below; kept so a document without "id" still raises
        # KeyError exactly as before.
        doc_id = doc["id"]
        if extract_field in doc:
            lang = doc.get(extracted_lang_field, 'en')

            # TODO Hack to ensure at least en is run
            lang = lang if lang in ner_models else 'en'

            mitie_entities = entities(doc[extract_field], lang)
            doc["entities"] = {extract_field + "_entities": mitie_entities}
            doc["entities"]["original_lang"] = lang
            doc["entities"][extract_field + "_entities_translated"] = {}

            # Now extract entities for any translated fields
            if not lang == 'en':
                mitie_entities = entities(doc[extracted_translated_field],
                                          'en')
                doc["entities"][extract_field +
                                "_entities_translated"] = mitie_entities

        # TODO do attachments here instead of in a seperate execution of this stage
        yield doc
Ejemplo n.º 32
0
def setup_mitie(mitie_directory, mitie_ner_model=None):
    """Given the location for MITIE and the model, create a named_entity_extractor object.

    :param mitie_directory: directory appended to ``sys.path`` so that MITIE
        can be found.
    :param mitie_ner_model: path of the NER model file. When omitted, the
        module-level ``mitie_ner_model`` is used, which is what this
        function relied on before the parameter existed.
    :return: the loaded ``named_entity_extractor`` object.
    :raises NameError: if no model path is passed and no module-level
        ``mitie_ner_model`` is defined.
    """
    if mitie_ner_model is None:
        # The parameter shadows the module-level name, so look the global
        # up explicitly to stay compatible with one-argument callers.
        module_globals = globals()
        if "mitie_ner_model" not in module_globals:
            raise NameError("name 'mitie_ner_model' is not defined")
        mitie_ner_model = module_globals["mitie_ner_model"]

    sys.path.append(mitie_directory)
    ner_model = mitie.named_entity_extractor(mitie_ner_model)
    return ner_model
Ejemplo n.º 33
0
 def __init__(self, metadata):
     """Load the MITIE models named in ``metadata`` and create the tokenizer.

     ``metadata["entity_extractor"]`` is loaded as the entity extractor and
     ``metadata["intent_classifier"]`` as the text categorizer.
     """
     extractor_path = metadata["entity_extractor"]
     self.extractor = named_entity_extractor(extractor_path)
     classifier_path = metadata["intent_classifier"]
     self.classifier = text_categorizer(classifier_path)
     self.tokenizer = MITIETokenizer()
Ejemplo n.º 34
0
# recognition and also how to run a binary relation detector on top of the
# named entity recognition outputs.
#
import os
import sys

# Make sure you put the mitielib folder into the python search path.  There are
# a lot of ways to do this, here we do it programmatically with the following
# two statements:
parent = os.path.dirname(os.path.realpath(__file__))
sys.path.append(parent + '/lib/MITIE/mitielib')

# Every print below passes a single pre-formatted string, which produces the
# same output under Python 2 and Python 3.
print(parent)

import mitie

print("loading NER model...")
# NOTE(review): the model and sample-text paths are relative to the current
# working directory, whereas the mitielib path above is relative to this
# file -- confirm the script is always run from its own directory.
ner = mitie.named_entity_extractor('lib/MITIE/english/ner_model.dat')
print("\nTags output by this NER model: %s" % (ner.get_possible_ner_tags(),))

# Load a text file and convert it into a list of words.
tokens = mitie.tokenize(mitie.load_entire_file('lib/MITIE/sample_text.txt'))
print("Tokenized input: %s" % (tokens,))

entities = ner.extract_entities(tokens)
print("\nEntities found: %s" % (entities,))
print("\nNumber of entities detected: %d" % len(entities))

for entity in entities:
    # entity[0] holds the token indices of the entity, entity[1] its tag.
    # Named ``token_indices`` so the builtin ``range`` is not rebound at
    # module scope.
    token_indices = entity[0]
    tag = entity[1]
    entity_text = " ".join(tokens[i] for i in token_indices)
    print("    " + tag + ": " + entity_text)