Example #1
# Imports inferred from the names used below; the original snippet
# begins mid-function, so its own import block is cut off.
import os
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tag import StanfordNERTagger
from nltk.internals import find_jars_within_path


def compute_tf(document, unique_terms, collection):
    # Function name, signature, and the word_tf initialisation are
    # inferred from the truncated body.
    word_tf = []
    for word in unique_terms:
        word_tf.append(collection.tf(word, document))
    return word_tf
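# The `collection` object is presumably an nltk.text.TextCollection
# (an assumption; the snippet never shows its construction), whose
# tf(term, text) returns text.count(term) / len(text). A minimal sketch:
#
#     from nltk.text import TextCollection
#     docs = [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]
#     coll = TextCollection(docs)
#     coll.tf('the', docs[0])   # -> 0.333...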


stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()

# Point NLTK at the local Java runtime and the Stanford NER distribution
# (Windows paths; adjust to your installation).
java_path = 'C:/Program Files (x86)/Java/jre1.8.0_101/bin/'
os.environ['JAVA_HOME'] = java_path
stanford_dir = 'C:/stanford-ner-2016-10-31/'
jarfile = stanford_dir + 'stanford-ner.jar'
modelfile = stanford_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
st = StanfordNERTagger(modelfile, jarfile)

# Put every jar from the distribution on the tagger's classpath;
# _stanford_jar is a private NLTK attribute, and ';' is the Windows
# classpath separator.
stanford_jars = find_jars_within_path(stanford_dir)
st._stanford_jar = ';'.join(stanford_jars)

if __name__ == "__main__":
    folder = "Thomas_Baker"
    # Empty list to hold text documents.
    texts = []

    listing = os.listdir(folder)
    for file in sorted(listing):
        if file.endswith(".txt"):
            path = os.path.join(folder, file)
            with open(path, encoding="latin-1") as f:
                raw = f.read()
            tokens = nltk.word_tokenize(raw)
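The snippet ends after tokenization; the rest of the loop body is cut off. A minimal sketch of how the objects set up above (stemmer, wordnet_lemmatizer, st, texts) might be applied to the tokens, as a hypothetical continuation rather than the original code:

            # Hypothetical continuation of the loop body (assumption):
            stems = [stemmer.stem(t) for t in tokens]
            lemmas = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
            entities = st.tag(tokens)  # [(token, tag), ...] with the 7-class MUC labels
            texts.append(tokens)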
Example #2
#!/usr/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger(
    'data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    'data/stanford-ner-2015-12-09/stanford-ner.jar')
# Build the classpath from every jar in the distribution; os.path.join
# keeps the path separator between the working directory and the data
# directory.
tagger._stanford_jar = ':'.join(
    find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))
# Read input, split it into sentences, strip punctuation from each
# sentence, and NER-tag the resulting token lists.
sentences = sent_tokenize(input('Enter a sentence: '))
cleaned = [''.join(c for c in s if c not in '",:.?/!@#$%^&*()][{}~').split()
           for s in sentences]
print(tagger.tag_sents(cleaned))
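For reference, tag_sents returns one list of (token, tag) pairs per sentence. A run might look like the following; the sentence and labels are illustrative, assuming the 3-class model configured above:

# Enter a sentence: Barack Obama visited Paris.
# [[('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'), ('Paris', 'LOCATION')]]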