def __init__(self, pipeline, language):
    # Hard-coded per-language setup: sentence splitter, word tokenizer, and
    # model paths for the tagger, parser, and (where available) lemmatizer.
    self.tagger = pipeline.turbo_interface.create_tagger()
    self.parser = pipeline.turbo_interface.create_parser()
    self.lemmatizer = None
    if language == 'PT':
        self.sent_tokenizer = nltk.data.load(
            'tokenizers/punkt/portuguese.pickle')
        self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model')
    elif language == 'PT-Cintil':
        self.sent_tokenizer = nltk.data.load(
            'tokenizers/punkt/portuguese.pickle')
        self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model')
    elif language == 'ES':
        self.sent_tokenizer = nltk.data.load(
            'tokenizers/punkt/spanish.pickle')
        self.word_tokenizer = nltk.TreebankWordTokenizer()  # For now...
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(
            '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model')
    elif language == 'EN':
        self.sent_tokenizer = nltk.data.load(
            'tokenizers/punkt/english.pickle')
        self.word_tokenizer = nltk.TreebankWordTokenizer()
        self.tagger.load_tagger_model(
            '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
        self.parser.load_parser_model(
            '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model')
    else:
        raise NotImplementedError
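# The constructor above relies on NLTK's pretrained Punkt sentence splitter
# and, for Spanish/English, the Treebank word tokenizer. A minimal standalone
# sketch of that tokenization step follows; the function name and sample
# arguments are illustrative only, and it assumes the Punkt pickles have been
# downloaded (e.g. via nltk.download('punkt')).
import nltk

def tokenize_example(text, punkt_model='tokenizers/punkt/english.pickle'):
    # Split raw text into sentences with a pretrained Punkt model, then split
    # each sentence into tokens with the Treebank word tokenizer.
    sent_tokenizer = nltk.data.load(punkt_model)
    word_tokenizer = nltk.TreebankWordTokenizer()
    return [word_tokenizer.tokenize(sentence)
            for sentence in sent_tokenizer.tokenize(text)]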
def __init__(self, pipeline, language):
    # Configuration-driven setup: each component is created and loaded only if
    # a model path is registered for it in pipeline.models[language].
    self.tagger = None
    self.parser = None
    self.semantic_parser = None
    self.lemmatizer = None
    if language not in pipeline.models:
        print('Error: no model for language %s.' % language)
        raise NotImplementedError
    if 'splitter' in pipeline.models[language]:
        self.sent_tokenizer = nltk.data.load(
            pipeline.models[language]['splitter'])
    else:
        # If no splitter is specified, use the English model.
        self.sent_tokenizer = nltk.data.load(
            'tokenizers/punkt/english.pickle')
    # Word tokenizer: language-specific for Portuguese, Treebank otherwise.
    if language == 'PT':
        self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
    elif language == 'PT-Cintil':
        self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
    else:
        self.word_tokenizer = nltk.TreebankWordTokenizer()  # For now...
    if 'tagger' in pipeline.models[language]:
        self.tagger = pipeline.turbo_interface.create_tagger()
        self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
    if 'parser' in pipeline.models[language]:
        self.parser = pipeline.turbo_interface.create_parser()
        self.parser.load_parser_model(pipeline.models[language]['parser'])
    if 'lemmatizer' in pipeline.models[language]:
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(
            pipeline.models[language]['lemmatizer'])
    if 'semantic_parser' in pipeline.models[language]:
        self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
        self.semantic_parser.load_semantic_parser_model(
            pipeline.models[language]['semantic_parser'])
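# The constructor above keys everything off pipeline.models, a per-language
# dictionary mapping component names to model paths. A hypothetical entry is
# sketched below; the keys match the ones the constructor looks up, but the
# '/path/to/models' directory and file names are placeholders, not the actual
# distributed models.
example_models = {
    'EN': {
        'splitter': 'tokenizers/punkt/english.pickle',
        'tagger': '/path/to/models/english_tagger.model',
        'parser': '/path/to/models/english_parser.model',
        'semantic_parser': '/path/to/models/english_semantic_parser.model',
    },
    'PT': {
        'splitter': 'tokenizers/punkt/portuguese.pickle',
        'tagger': '/path/to/models/portuguese_tagger.model',
        'parser': '/path/to/models/portuguese_parser.model',
        'lemmatizer': '/path/to/models/portuguese_lemmatizer.model',
    },
}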
def __init__(self, pipeline, language):
    self.tagger = None
    self.morphological_tagger = None
    self.entity_recognizer = None
    self.parser = None
    self.semantic_parser = None
    self.lemmatizer = None
    self.coreference_resolver = None
    if language not in pipeline.models:
        print('Error: no model for language %s.' % language)
        raise NotImplementedError
    if 'splitter' in pipeline.models[language]:
        self.sent_tokenizer = nltk.data.load(
            pipeline.models[language]['splitter'])
    else:
        # If no splitter is specified, use the English model.
        self.sent_tokenizer = nltk.data.load(
            'tokenizers/punkt/english.pickle')
    if 'tokenizer' in pipeline.models[language]:
        tokenizer_language = pipeline.models[language]['tokenizer']
        self.word_tokenizer = UniversalWordTokenizer(
            language=tokenizer_language)
    else:
        self.word_tokenizer = UniversalWordTokenizer(language='none')
    if 'tagger' in pipeline.models[language]:
        self.tagger = pipeline.turbo_interface.create_tagger()
        self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
    if 'morphological_tagger' in pipeline.models[language]:
        self.morphological_tagger = \
            pipeline.turbo_interface.create_morphological_tagger()
        self.morphological_tagger.load_morphological_tagger_model(
            pipeline.models[language]['morphological_tagger'])
    if 'entity_recognizer' in pipeline.models[language]:
        self.entity_recognizer = \
            pipeline.turbo_interface.create_entity_recognizer()
        self.entity_recognizer.load_entity_recognizer_model(
            pipeline.models[language]['entity_recognizer'])
    if 'parser' in pipeline.models[language]:
        self.parser = pipeline.turbo_interface.create_parser()
        self.parser.load_parser_model(pipeline.models[language]['parser'])
    if 'lemmatizer' in pipeline.models[language]:
        self.lemmatizer = lemmatizer.BasicLemmatizer()
        self.lemmatizer.load_lemmatizer_model(
            pipeline.models[language]['lemmatizer'])
    if 'semantic_parser' in pipeline.models[language]:
        self.semantic_parser = \
            pipeline.turbo_interface.create_semantic_parser()
        self.semantic_parser.load_semantic_parser_model(
            pipeline.models[language]['semantic_parser'])
    if 'coreference_resolver' in pipeline.models[language]:
        self.coreference_resolver = \
            pipeline.turbo_interface.create_coreference_resolver()
        self.coreference_resolver.load_coreference_resolver_model(
            pipeline.models[language]['coreference_resolver'])
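# This later constructor adds morphological tagging, entity recognition, and
# coreference resolution, all driven by the same per-language configuration.
# How pipeline.models is populated is not shown here; below is a minimal
# sketch assuming a simple JSON file of the form
# {language: {component: model_path}}. The 'models.json' file name and the
# load_models helper are hypothetical, not part of the actual pipeline.
import json

KNOWN_COMPONENTS = {
    'splitter', 'tokenizer', 'tagger', 'morphological_tagger',
    'entity_recognizer', 'parser', 'lemmatizer', 'semantic_parser',
    'coreference_resolver',
}

def load_models(config_path='models.json'):
    # Read the JSON configuration and keep only the component keys the
    # constructor above knows how to handle.
    with open(config_path) as f:
        config = json.load(f)
    return {language: {name: path for name, path in components.items()
                       if name in KNOWN_COMPONENTS}
            for language, components in config.items()}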