def Init():
    """Initialise the Stanford dependency parser and the English stopword list.

    Returns:
        tuple: ``(parser, None, stop)`` — the configured
        StanfordDependencyParser, a ``None`` placeholder (a commented-out NER
        tagger used to live here), and NLTK's English stopwords.
    """
    parser = stanford.StanfordDependencyParser(
        model_path="./stanford_libs/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # Widen the classpath to every jar under the Stanford directory so the
    # logging jars are found beside the parser jar.
    # (The original called find_jars_within_path twice and discarded the
    # first result; the duplicate call is removed.)
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))
    stop = stopwords.words('english')
    return parser, None, stop
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Locate the Stanford tagger jar and model file, then widen the
    classpath to every jar in the jar's directory.

    NOTE(review): subclasses are expected to provide ``_JAR``; instantiating
    the base class directly only emits a warning.
    """
    if not self._JAR:
        warnings.warn(
            'The StanfordTagger class is not meant to be '
            'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?'
        )
    self._stanford_jar = find_jar(
        self._JAR, path_to_jar,
        searchpath=(), url=_stanford_url, verbose=verbose)
    self._stanford_model = find_file(
        model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose)
    # Adding logging jar files to classpath
    jar_dir = os.path.split(self._stanford_jar)[0]
    self._stanford_jar = tuple(find_jars_within_path(jar_dir))
    self._encoding = encoding
    self.java_options = java_options
def nonlocal_ner_tag_tokens(self):
    """NER-tag every line of every tokenised document in one batch call.

    Flattens ``self.tokenized_docs_by_lines`` into a single sentence list,
    tags it with one ``tag_sents`` call (one JVM round-trip instead of one
    per line), then re-splits the results per document into
    ``self.nonlocal_ner_doc_tokens``.
    """
    home = expanduser("~")
    os.environ['CLASSPATH'] = home + '/stanford-ner-2015-12-09'
    os.environ['STANFORD_MODELS'] = home + '/stanford-ner-2015-12-09/classifiers'
    st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz",
                           java_options='-mx4000m')
    stanford_dir = st._stanford_jar[0].rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    # do not tokenise text
    nltk.internals.config_java(
        options='-tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=true"'
    )
    length_of_docs = [len(doc) for doc in self.tokenized_docs_by_lines]
    # Flatten docs -> lines (the original nested loop carried unused
    # doc/line indices; a comprehension does the same job).
    flat_lines = [line
                  for doc in self.tokenized_docs_by_lines
                  for line in doc]
    tagged = st.tag_sents(flat_lines)
    # Re-slice the flat tagged list back into per-document chunks.
    self.nonlocal_ner_doc_tokens = []
    current_idx = 0
    for doc_len in length_of_docs:
        self.nonlocal_ner_doc_tokens.append(
            tagged[current_idx:current_idx + doc_len])
        current_idx += doc_len
    print("NER nonlocal tagged tokens")
def get_postag_with_index(sources, idx2word, word2idx):
    """POS-tag each index-encoded source sequence with the Stanford tagger.

    Args:
        sources: iterable of word-index sequences.
        idx2word: index -> word map used to decode each sequence.
        word2idx: unused here; kept for interface compatibility with callers.

    Returns:
        list: one list of ``(token, tag)`` pairs per source.
    """
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # Widen the classpath to every jar beside the tagger jar (logging deps).
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)
    tagged_source = []
    # Predict on testing data.
    # BUG FIX: ``xrange`` is Python-2-only while this file uses print();
    # enumerate() is equivalent and works on both.
    for idx, test_s_o in enumerate(sources):
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))
        tagged_source.append(text)
    return tagged_source
def get_postag_with_record(records, pairs):
    """POS-tag the first element of each pair, logging progress per record.

    Returns a list with one tagged sequence per ``(record, pair)`` couple.
    """
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    tagger = StanfordPOSTagger(model, jar)
    # Extend the tagger classpath with every jar next to the main jar.
    jar_dir = jar.rpartition('/')[0]
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    results = []
    for i, (record, pair) in enumerate(zip(records, pairs)):
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        tagged = tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (i, len(records), len(pair[0]), str(tagged)))
        results.append(tagged)
    return results
def get_word_dependencies(text):
    """Dependency-parse *text* and return ``{head_word: [(dep_word, rel), ...]}``.

    Args:
        text: raw sentence string handed to the Stanford dependency parser.

    Returns:
        dict mapping each governing word to the list of its
        ``(dependent word, relation label)`` pairs.
    """
    dependencies = {}
    dep_parser = StanfordDependencyParser(
        model_path=osp.join(
            datadir,
            "stanford_data/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"),
        java_options="-mx4g -XX:-UseGCOverheadLimit")
    # BUG FIX: the model and the jar were passed in swapped positions
    # (StanfordPOSTagger expects model_filename first, path_to_jar second),
    # and the options string was malformed ('-mx4g, XX:-UseGCOverheadLimit').
    st = StanfordPOSTagger(
        osp.join(datadir, 'stanford_pos/models/english-bidirectional-distsim.tagger'),
        osp.join(datadir, "stanford_pos/stanford-postagger-3.9.1.jar"),
        java_options='-mx4g -XX:-UseGCOverheadLimit')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    # BUG FIX: was ``st.stanford_jar`` (missing underscore) — a dead
    # attribute the tagger never reads.
    st._stanford_jar = ':'.join(stanford_jars)
    dep = next(dep_parser.raw_parse(text))
    for head, relation, dependent in dep.triples():
        # head/dependent are (word, tag) pairs; keep only the word.
        dependencies.setdefault(head[0], []).append((dependent[0], relation))
    return dependencies
def dependency_parser_nltk(word_lists, filename):
    """Dependency-parse each sentence; shelve per-sentence node tables.

    Args:
        word_lists: iterable of pre-tokenised sentences (space-separated).
        filename: path of the shelve file used as a backup store.

    Returns:
        list: one list of dependency triples per sentence.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordDependencyParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'
    node_file = shelve.open(filename)
    all_dependency_list = []
    for index, sentence in enumerate(word_lists):
        # Stored in all_dependency_list as a plain list of triples.
        res = list(chinese_parser.parse(sentence.strip().split()))
        print("we have finished ", index + 1, " sentence!!!")
        all_dependency_list.append(list(res[0].triples()))
        # Backup copy into node_file, keyed by sentence index; values are
        # the dependency graph's node-property dicts.
        # BUG FIX: the original probed ``node[i]`` for i in
        # range(len(node) * 2); the nodes mapping is a defaultdict, so every
        # missing index silently inserted an empty entry while iterating.
        # Walk a snapshot of the existing nodes instead.
        node_dict = {}
        for props in list(res[0].nodes.values()):
            if props['word'] is not None or props['ctag'] is not None:
                node_dict[props['address']] = props
        node_file[str(index)] = node_dict
    node_file.close()
    return all_dependency_list
def tokernizer(self, tagger):
    """Word-tokenise ``self.sentence`` and tag it with *tagger*.

    The tagger's classpath is first widened to every jar sitting beside its
    Stanford jar.
    """
    jar_dir = tagger._stanford_jar.rpartition('/')[0]
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    # tags = tagger.tag(self.stop_wrds())
    return tagger.tag(word_tokenize(self.sentence))
def __init__(self, model='stanford/models/english-bidirectional-distsim.tagger',
             libpath='stanford/', verbose=False):
    """Remember the model path, collect the Stanford jars under *libpath*,
    pre-compile the tagger's XML output pattern, and configure Java.
    """
    self._model = model
    self._verbose = verbose
    self._libs = find_jars_within_path(libpath)
    # Matches a single <word .../> element of the tagger's XML output:
    # groups are (pos, lemma, surface form).
    self._xml_regex = re.compile(
        r' <word wid="[0-9]*" pos="([^"]*)" lemma="([^"]*)">(.*?)</word>')
    config_java(verbose=verbose)
def load_pos_tagger(stanford_base_dir):
    """Build a StanfordPOSTagger rooted at *stanford_base_dir*.

    Args:
        stanford_base_dir: directory containing ``stanford-postagger.jar``
            and its ``models/`` subdirectory.

    Returns:
        A StanfordPOSTagger whose classpath covers every jar beside the
        tagger jar (logging dependencies included).
    """
    jar = stanford_base_dir + '/stanford-postagger.jar'
    model = stanford_base_dir + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model_filename=model, path_to_jar=jar)
    # Fixed: the original clobbered the ``stanford_base_dir`` parameter here;
    # use a dedicated local for the jar's directory instead.
    jar_dir = jar.rpartition('/')[0]
    pos_tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    return pos_tagger
def load_pos_tagger():
    """Locate the bundled stanford-postagger under the 'pykp' tree and
    return a configured StanfordPOSTagger.
    """
    # BUG FIX: the original assigned os.path.dirname(__file__) to ``path``
    # but then sliced an undefined name ``file_dir`` — a NameError at call
    # time (unless a same-named global happened to exist).
    file_dir = os.path.dirname(__file__)
    path = os.path.join(file_dir[:file_dir.rfind('pykp') + 4], 'stanford-postagger')
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # Widen the classpath to every jar beside the tagger jar.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)
    return pos_tagger
def xxtest_StanfordPOSTagger(self):
    # Disabled smoke test (the 'xx' prefix keeps the test runner from
    # collecting it): tags a sentence with a locally installed Stanford
    # POS tagger and only checks that *something* comes back.
    jar = '\\usr\\stanford-postagger-full-2015-12-09\\stanford-postagger.jar'
    model = '\\usr\\stanford-postagger-full-2015-12-09\\models\\english-left3words-distsim.tagger'
    tagger = StanfordPOSTagger(model, jar)
    # NOTE(review): ``_stanford_jar[0]`` takes the first *character* when
    # the attribute is a plain string path — confirm the nltk version in
    # use stores a tuple here, otherwise this computes a bogus directory.
    stanford_dir = tagger._stanford_jar[0].rpartition('\\')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    # NOTE(review): ':' is the POSIX classpath separator; Windows paths
    # (as used above) normally need ';'.
    tagger._stanford_jar = ':'.join(stanford_jars)
    text = tagger.tag(word_tokenize("What's the airspeed of an unladen swallow ?"))
    self.assertTrue(text is not None)
def __init__(
    self,
    path_to_jar=None,
    path_to_models_jar=None,
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
    encoding='utf8',
    verbose=False,
    java_options='-mx4g',
    corenlp_options='',
):
    """Resolve the newest Stanford parser code jar and model jar, then
    assemble the classpath and remember the run options.

    The candidate whose containing directory sorts greatest wins, i.e. the
    most recent release when versions are encoded in the path.
    """
    def pick_newest(candidates):
        # Greatest containing directory == most recent install.
        return max(candidates, key=lambda jar_path: os.path.dirname(jar_path))

    # find the most recent code and model jar
    stanford_jar = pick_newest(
        find_jar_iter(
            self._JAR,
            path_to_jar,
            env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
            is_regex=True,
        )
    )
    model_jar = pick_newest(
        find_jar_iter(
            self._MODEL_JAR_PATTERN,
            path_to_models_jar,
            env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
            searchpath=(),
            url=_stanford_url,
            verbose=verbose,
            is_regex=True,
        )
    )
    # self._classpath = (stanford_jar, model_jar)
    # Classpath = model jar plus every jar in the code jar's directory
    # (this pulls in the logging jars as well).
    jar_dir = os.path.split(stanford_jar)[0]
    self._classpath = tuple([model_jar] + find_jars_within_path(jar_dir))
    self.model_path = model_path
    self._encoding = encoding
    self.corenlp_options = corenlp_options
    self.java_options = java_options
def get_pos_tag(sen):  # pass sentence dataframe
    """POS-tag the 'Arg' column of DataFrame *sen* into a 'POStag' column.

    Each row's 'Arg' string is whitespace-split, tagged with the Stanford
    tagger, and the list of tags (without tokens) is written back in place.
    Returns the mutated DataFrame.
    """
    st = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')  # ,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t = st.tag(sen.loc[i, 'Arg'].split())
        # Keep only the tag of each (token, tag) pair — replaces the manual
        # range(len(t)) append loop.
        tags = [tag for _, tag in t]
        # NOTE(review): DataFrame.set_value and Index.get_values are removed
        # in modern pandas; migrate to .at[...] / .to_numpy() when upgrading.
        sen.set_value(i, 'POStag', tags)
    return sen
def stanford_ne_tagger(tokens):
    """Return the set of lower-cased LOCATION entity strings in *tokens*.

    Tags the token list with the Stanford NER model, merges adjacent
    same-type tokens via ``get_continuous_chunks``, and keeps only the
    chunks whose type is LOCATION.
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    tags = st.tag(tokens)
    continuous_chunks = get_continuous_chunks(tags)
    named_entities_str_tag = set()
    for ne in continuous_chunks:
        # The chunk's entity type is carried on its first (token, tag) pair.
        if ne[0][1] == u'LOCATION':
            # BUG FIX: the free function ``lower(...)`` does not exist in
            # Python 3 (it was ``string.lower`` in Python 2); the str method
            # gives the identical result on both.
            named_entities_str_tag.add(
                u' '.join(token for token, tag in ne).lower())
    return named_entities_str_tag
def __init__(self, path_to_jar=None, encoding='utf8',
             options=None, verbose=False, java_options='-mx1000m'):
    """Find the Stanford jar, widen the classpath, and render the options
    dict into the tagger's comma-separated command-line form.
    """
    self._stanford_jar = find_jar(
        self._JAR,
        path_to_jar,
        env_vars=('STANFORD_POSTAGGER',),
        searchpath=(),
        url=_stanford_url,
        verbose=verbose,
    )
    # Adding logging jar files to classpath
    jar_dir = os.path.split(self._stanford_jar)[0]
    self._stanford_jar = tuple(find_jars_within_path(jar_dir))
    self._encoding = encoding
    self.java_options = java_options
    if options is None:
        options = {}
    self._options_cmd = ','.join(
        '{0}={1}'.format(key, val) for key, val in options.items())
def get_pos_tag(sen):
    """POS-tag the 'Arg' column of DataFrame *sen* into a 'POStag' column.

    Variant that also exports CLASSPATH/STANFORD_MODELS before constructing
    the tagger. Returns the mutated DataFrame.
    """
    # NOTE(review): these are *literal* relative paths — 'STANFORDTOOLSDIR'
    # is not expanded as a shell variable by os.environ assignment;
    # presumably os.path.expandvars('$STANFORDTOOLSDIR/...') was intended.
    os.environ['CLASSPATH'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar'  # set classpath to pos tagger
    os.environ['STANFORD_MODELS'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')  # ,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t = st.tag(sen.loc[i, 'Arg'].split())
        # Keep only the tag of each (token, tag) pair — replaces the manual
        # range(len(t)) append loop.
        tags = [tag for _, tag in t]
        sen.set_value(i, 'POStag', tags)
    return sen
def find_maltparser(parser_dirname):
    """A module to find MaltParser .jar file and its dependencies.

    Args:
        parser_dirname: either a full path to the MaltParser install, or a
            directory name to be resolved via the MALT_PARSER env var.

    Returns:
        list: paths of every jar under the resolved directory.

    Raises:
        AssertionError: if the required dependency jars or a
            ``maltparser-*.jar`` are missing.
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
    # Checks that the found directory contains all the necessary .jar files.
    # (Removed the dead ``malt_dependencies = ['','','']`` placeholder that
    # was immediately overwritten.)
    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = {os.path.split(jar)[1] for jar in _malt_jars}
    malt_dependencies = {'log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'}
    assert malt_dependencies.issubset(_jars)
    assert any(jar.startswith('maltparser-') and jar.endswith('.jar')
               for jar in _jars)
    return list(_malt_jars)
def check_postag(config):
    """POS-tag the sources of each configured testing dataset and print them.

    Loads the vocabulary from ``config['dataset']``, then for every dataset
    in ``config['testing_datasets']`` decodes each source sequence and runs
    the Stanford POS tagger over it, printing the results.
    """
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    for dataset_name in config['testing_datasets']:
        # override the original test_set
        test_sets = load_additional_testing_data(config['testing_datasets'],
                                                 idx2word, word2idx, config)
        test_set = test_sets[dataset_name]
        # Materialise the pairs so len() and repeated iteration also work on
        # Python 3, where zip() is a lazy iterator.
        test_data_plain = list(zip(test_set['source'], test_set['target']))
        # Alternatively to setting the CLASSPATH add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)
        # Hoisted out of the per-example loop: the jar set never changes, so
        # scanning the Stanford directory once per dataset is enough.
        stanford_dir = jar.rpartition('/')[0]
        stanford_jars = find_jars_within_path(stanford_dir)
        pos_tagger._stanford_jar = ':'.join(stanford_jars)
        # BUG FIX: replaced the Python-2-only ``xrange`` index loop.
        for test_s_o, test_t_o in test_data_plain:
            source = keyphrase_utils.cut_zero(test_s_o, idx2word)
            print(source)
            text = pos_tagger.tag(source)
            print(text)
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
             verbose=False, java_options='-mx1000m'):
    """Resolve the tagger jar and the model file, then replace the jar path
    with the full set of jars found in its directory.
    """
    if not self._JAR:
        warnings.warn('The StanfordTagger class is not meant to be '
                      'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
    jar_path = find_jar(self._JAR, path_to_jar,
                        searchpath=(), url=_stanford_url, verbose=verbose)
    self._stanford_jar = jar_path
    self._stanford_model = find_file(model_filename,
                                     env_vars=('STANFORD_MODELS',),
                                     verbose=verbose)
    # Adding logging jar files to classpath
    self._stanford_jar = tuple(
        find_jars_within_path(os.path.split(jar_path)[0]))
    self._encoding = encoding
    self.java_options = java_options
def parser_nltk(word_lists, filename):
    """Constituency-parse every non-empty sentence with the Stanford parser.

    Raw parse results are shelved under their running index in *filename*
    (as a backup); flattened string forms are returned in input order.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'
    all_parser_sentence = []
    # Renamed from ``file`` — it shadowed the builtin.
    shelf = shelve.open(filename)
    flag = 0  # counts only the non-empty sentences actually parsed
    for sentence in word_lists:
        if sentence.strip() != "":
            res = list(chinese_parser.parse(sentence.strip().split()))
            new_str = return_str_tofile(sentence_parse=str(res[0]))
            shelf[str(flag)] = res
            all_parser_sentence.append(new_str)
            flag += 1
            print("###### NLTK Dependency Parser Have finished " + str(flag) + " sentences ###")
    # BUG FIX: the shelve file was never closed, so buffered entries could
    # be lost; close it like dependency_parser_nltk does.
    shelf.close()
    return all_parser_sentence
# -*- coding: utf-8 -*-
# export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar
from __future__ import unicode_literals
import os
import sys
import io
import copy
import nltk
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

# Module-level parser; its classpath is widened to every jar living next to
# the main parser jar (logging dependencies etc.).
parser = StanfordParser(
    model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
stanford_dir = parser._classpath[0].rpartition('/')[0]
parser._classpath = tuple(find_jars_within_path(stanford_dir))
# from set_parser import parse_it


class Node(object):
    """
    A generic representation of a tree node. Includes a string label and a
    list of a children.
    """

    def __init__(self, label):
        """
        Creates a node with the given label. The label must be a string for
        use with the PQ-Gram algorithm.
        """
        self.label = label
        self.children = []
# -*- coding: utf-8 -*- # export CLASSPATH=$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar from __future__ import unicode_literals import os import sys import io import copy import nltk from nltk.internals import find_jars_within_path from nltk.parse.stanford import StanfordParser parser=StanfordParser(model_path="stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") stanford_dir = parser._classpath[0].rpartition('/')[0] parser._classpath = tuple(find_jars_within_path(stanford_dir)) # from set_parser import parse_it class Node(object): """ A generic representation of a tree node. Includes a string label and a list of a children. """ def __init__(self, label): """ Creates a node with the given label. The label must be a string for use with the PQ-Gram algorithm. """ self.label = label self.children = list() def addkid(self, node, before=False): """ Adds a child node. When the before flag is true, the child node will be inserted at the beginning of the list of children, otherwise the child node is appended.
def preprocess(flist, folder_path):
    """ (file open for reading, str) -> Nonetype

    flist contains one filename per line and folder_path represents a
    directory. Do preprocessing on each file from flist in folder_path:
    write .stem, .pos and .pars outputs, collecting failures in an
    error_log file.
    """
    error_log = []
    # Hoisted out of the per-file loop: stemmer and parser carry no per-file
    # state, and rebuilding the Stanford parser only re-scans the classpath.
    stemmer = PorterStemmer()
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        verbose=True)
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))
    for path in flist:
        with open(path, 'r') as rf:
            try:
                sent = [line.strip('\n ') for line in rf]
            except UnicodeDecodeError:
                error_log.append('Unicode Decode Error:\t' + path + '\n')
                continue
        if not sent:
            error_log.append('Empty File Error:\t' + path + '\n')
            continue
        base = folder_path + path.split('.')[0].split('/')[-1]
        # Stemming with Porter Stemmer
        pars_stem = stemmer.stem(' '.join(sent))
        with open(base + '.stem', 'w') as wf:
            wf.write('\n'.join(sent))
        # POS Tagging after tokenizing and stemming
        pos = nltk.pos_tag(pars_stem.split())
        with open(base + '.pos', 'w') as wf:
            wf.write(str(pos))
        # CFG parser
        try:
            parsed = parser.raw_parse(pars_stem)
        except (TypeError, IndexError, NameError):
            # BUG FIX: the original fell through after a failed parse and
            # used the unbound ``parsed`` anyway (NameError, or stale data
            # from the previous file); skip the file instead.
            error_log.append('Unparsable Error:/t' + path + '/n')
            continue
        s_pars = " ".join(str(x) for x in list(parsed))
        for junk in ("Tree", "[", "]", "\'"):
            s_pars = s_pars.replace(junk, "")
        with open(base + '.pars', 'w') as wf:
            wf.write(s_pars)
    # Print files paths with Errors
    if error_log:
        # BUG FIX: mode was 'wb' while writing str lines — a TypeError on
        # Python 3; text mode matches the str content.
        with open(folder_path + 'error_log', 'w') as wf:
            for line in error_log:
                wf.write(line)
def update_tagger_jars(tagger):
    """Widen *tagger*'s classpath to every jar beside its Stanford jar and
    return the (mutated) tagger for convenient chaining.
    """
    jar_dir = tagger._stanford_jar.rpartition('/')[0]
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))
    return tagger
def __init__(self, libpath='stanford/', verbose=False):
    """Collect every Stanford jar under *libpath* and configure the JVM."""
    self._verbose = verbose
    self._libs = find_jars_within_path(libpath)
    config_java(verbose=verbose)
#!/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger(
    'data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    'data/stanford-ner-2015-12-09/stanford-ner.jar')
# BUG FIX: ``os.getcwd() + 'data/...'`` concatenated without a path
# separator, yielding e.g. '/home/userdata/...'; join the components.
tagger._stanford_jar = ':'.join(find_jars_within_path(
    os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))
# Strip punctuation from each sentence of the user's input, split on
# whitespace, and NER-tag every sentence in one batch.
print(tagger.tag_sents(
    [''.join([c for c in x if c not in '",:.?/!@#$%^&*()][{}~']).split()
     for x in sent_tokenize(input('Enter a sentence: '))]))
def __init__(self):
    """Create the Stanford constituency parser and widen its classpath to
    every jar in the parser jar's directory.
    """
    self.parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    jar_dir = self.parser._classpath[0].rpartition('/')[0]
    self.parser._classpath = tuple(find_jars_within_path(jar_dir))
word_tf = [] for word in unique_terms: word_tf.append(collection.tf(word, document)) return word_tf stemmer = SnowballStemmer("english") wordnet_lemmatizer = WordNetLemmatizer() java_path = 'C:/Program Files (x86)/Java/jre1.8.0_101/bin/' os.environ['JAVA_HOME'] = java_path stanford_dir = 'C:/stanford-ner-2016-10-31/' jarfile = stanford_dir + 'stanford-ner.jar' modelfile = stanford_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz' st = StanfordNERTagger(modelfile, jarfile) stanford_jars = find_jars_within_path(stanford_dir) st._stanford_jar = ';'.join(stanford_jars) if __name__ == "__main__": folder = "Thomas_Baker" # Empty list to hold text documents. texts = [] listing = os.listdir(folder) for file in sorted(listing): if file.endswith(".txt"): url = folder + "/" + file f = open(url, encoding="latin-1") raw = f.read() f.close() tokens = nltk.word_tokenize(raw)
# Chinese POS tagging
chi_tagger = StanfordPOSTagger(
    model_filename='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/models/chinese-distsim.tagger',
    path_to_jar='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。'.split()))

# English constituency parsing
#import os
#java_path='/usr/lib/jvm/jdk/jdk1.8.0_121'
#os.environ['JAVAHOME']=java_path
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser

eng_parser = StanfordParser(
    '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
    '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
    '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
# BUG FIX: ``eng_parser.__classpath`` set an attribute the parser never
# reads (no name mangling applies outside a class body); ``_classpath`` is
# the attribute nltk actually uses.
# NOTE(review): the scanned directory (2016-10-31) differs from the parser
# jars above (2015-04-20) — confirm which install is intended.
eng_parser._classpath = tuple(find_jars_within_path(
    '/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/'))
print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))

# English dependency parsing
from nltk.parse.stanford import StanfordDependencyParser

eng_parser = StanfordDependencyParser(
    '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
    '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
    '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
print(res[0])
for row in res[0].triples():
    print(row)
#!/bin/env python3.5 #Author: Saurabh Pathak from nltk.internals import find_jars_within_path from nltk.parse.stanford import StanfordParser from nltk.tokenize import sent_tokenize from nltk import download from nltk.tree import ParentedTree import os #download('punkt', quiet=True) #download('names', quiet=True) os.environ['CLASSPATH'] = os.getenv('CLASSPATH', '') + os.getcwd() + 'data/stanford-parser-full-2015-12-09/stanford-parser.jar:' + os.getcwd() + 'data/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar' parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') parser._classpath = find_jars_within_path(os.getcwd() + 'data/stanford-parser-full-2015-12-09') text = input('Enter some text:') tlist = [ParentedTree.fromstring(str(list(parsetree)[0])) for parsetree in parser.raw_parse_sents(sent_tokenize(text))] tlist2 = [tree.copy(True) for tree in tlist] from hobbs import * from lappinleasse import * print('Input text was:\n', text) def resolve(ls, algo): print('\nResolving with', algo) i = -1 for parsetree in ls: i += 1
from nltk.tag import StanfordNERTagger from nltk.tokenize import word_tokenize st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar') text = 'While in Frabce' tokenized_text = word_tokenize(text) #print tokenized_text #classified_text = st.tag(tokenized_text) #print(classified_text) import nltk from nltk.tag import StanfordNERTagger st = StanfordNERTagger('/home/ubuntu/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar='/home/ubuntu/stanford-ner-2015-12-09/stanford-ner.jar') print st._stanford_jar stanford_dir = st._stanford_jar.rpartition('/')[0] from nltk.internals import find_jars_within_path stanford_jars = find_jars_within_path(stanford_dir) print ":".join(stanford_jars) st._stanford_jar = ':'.join(stanford_jars) print st._stanford_jar text = st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) print text