def get_named_entities_sents(self, sents):
    """Run the Stanford NER tagger (German CRF model) over tokenised sentences.

    Loads the tagger jar and the German model from the package-local
    ``resources/`` directory, tags every sentence in one batch, stores the
    raw output in ``self.named_entities`` and returns the sorted result.

    :param sents: tokenised sentences (a list of token lists), as expected
                  by ``StanfordNERTagger.tag_sents``.
    :return: result of ``self.sort_named_entities()`` (defined elsewhere
             in this class — presumably entities grouped/sorted; confirm).
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    ner_tagger_path = os.path.join(dir_path, "resources", "stanford-ner.jar")
    german_model = os.path.join(dir_path, "resources", "german.conll.hgc_175m_600.crf.ser.gz")
    tagger = StanfordNERTagger(german_model, ner_tagger_path, encoding="UTF-8")  # iso-8859-15
    # Raise the JVM heap for long sentence batches.
    # FIX: '-mx2048' had no unit suffix, which means 2048 *bytes* and makes
    # the JVM refuse to start; it must be '-mx2048m'.
    tagger.java_options = '-mx2048m -Xmx2048m -Xms2048m'
    # FIX: JVM options are case-sensitive; '-xmx2G' is unrecognized and
    # aborts the java subprocess NLTK launches. Correct flag is '-Xmx2g'.
    nltk.internals.config_java(options='-Xmx2g')
    print("Running named entity recognition on sentences")
    t0 = time()
    self.named_entities = tagger.tag_sents(sents)
    # NOTE: tag_sents returns one tagged token list per sentence, so this
    # count is the number of sentences processed, not individual entities.
    print(len(self.named_entities), " named entities found")  # FIX: typo 'entitites'
    print("done in %0.3fs" % (time() - t0))
    return self.sort_named_entities()
# --- Stanford POS tagger setup (start of this setup lies outside this chunk) ---
# 'current_path' and the first 'path_to_model'/'path_to_jar' assignments are
# presumably defined earlier in the file — verify against the full source.
path_to_jar = os.path.join(current_path, '..', path_to_jar)
standford_tagger = StanfordPOSTagger(path_to_model, path_to_jar)  # NOTE(review): 'standford' misspelling kept for compatibility with existing callers
standford_tagger.java_options = '-mx1024m' ### Setting higher memory limit for long sentences

# --- Stanford NER tagger setup ---
# https://pythonprogramming.net/named-entity-recognition-stanford-ner-tagger/
from nltk.tag import StanfordNERTagger
# Paths are relative to the project root (one level above this file).
path_to_model = "input/stanford/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz"
path_to_jar = "input/stanford/stanford-ner-2014-08-27/stanford-ner.jar"
path_to_model = os.path.join(current_path, '..', path_to_model)
path_to_jar = os.path.join(current_path, '..', path_to_jar)
standford_ner = StanfordNERTagger(path_to_model, path_to_jar)
standford_ner.java_options = '-mx1024m' ### Setting higher memory limit for long sentences

# ------------------------------------------------------------------------------
# functions
# ------------------------------------------------------------------------------

# tokenisation
def tokeniser(text):
    """Split raw *text* into a list of word tokens via NLTK."""
    return nltk.word_tokenize(text)

# tagging
def tn_tagger(tokenised_text, tagger='stanford'):
    """Tag *tokenised_text* with either NLTK's pos_tag or the Stanford tagger.

    NOTE(review): 'pos_tag' is presumably imported earlier in the file;
    the 'stanford' branch continues beyond this chunk.
    """
    if tagger == 'nltk':
        return pos_tag(tokenised_text)
    elif tagger == 'stanford':