def get_postag_with_record(records, pairs):
    """POS-tag the first element of every pair with the Stanford POS tagger.

    Args:
        records: sequence of mappings; only ``record['name']`` is read, for
            progress output.
        pairs: sequence aligned with ``records``; ``pair[0]`` is handed to
            ``StanfordPOSTagger.tag`` (presumably a list of tokens -- confirm
            against the caller).

    Returns:
        list: the ``tag`` result for each zipped ``(record, pair)``, in order.
    """
    path = os.path.dirname(__file__)
    # Keep everything up to (and including) the last separator found before
    # the final 10 characters of this module's directory, then point at the
    # tagger folder. If no separator is found the result is the relative
    # path 'stanford-postagger/'.
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)

    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Put every jar found under the tagger directory on the Java classpath.
    # os.pathsep (':' on POSIX, ';' on Windows) is the separator the JVM
    # expects; a hard-coded ':' breaks on Windows.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = os.pathsep.join(stanford_jars)

    tagged_source = []
    total = len(records)
    for idx, (record, pair) in enumerate(zip(records, pairs)):
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, total, len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source
def get_postag_with_index(sources, idx2word, word2idx):
    """POS-tag index-encoded sources with the Stanford POS tagger.

    Args:
        sources: sequence of encoded sources; each one is converted with
            ``keyphrase_utils.cut_zero(source, idx2word)`` before tagging.
        idx2word: lookup passed through to ``keyphrase_utils.cut_zero``.
        word2idx: unused here; kept so existing callers keep working.

    Returns:
        list: the ``StanfordPOSTagger.tag`` result for each source, in order.
    """
    path = os.path.dirname(__file__)
    # Keep everything up to (and including) the last separator found before
    # the final 10 characters of this module's directory, then point at the
    # tagger folder.
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)

    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Put every jar found under the tagger directory on the Java classpath,
    # joined with the platform's path-list separator.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = os.pathsep.join(stanford_jars)

    tagged_source = []
    total = len(sources)
    # enumerate() replaces the Python-2-only xrange(len(...)) index loop.
    for idx, test_s_o in enumerate(sources):
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, total, str(text)))
        tagged_source.append(text)

    return tagged_source
def load_pos_tagger(stanford_base_dir):
    """Build a Stanford POS tagger from an installation directory.

    Args:
        stanford_base_dir: directory expected to contain
            ``stanford-postagger.jar`` and
            ``models/english-bidirectional-distsim.tagger``.

    Returns:
        StanfordPOSTagger: tagger whose classpath lists every jar found
        under ``stanford_base_dir``.
    """
    import os  # local import: this block does not otherwise rely on `os`

    jar = stanford_base_dir + '/stanford-postagger.jar'
    model = stanford_base_dir + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model_filename=model, path_to_jar=jar)

    # The jar's parent directory is exactly stanford_base_dir, so search it
    # directly. Join with os.pathsep (':' on POSIX, ';' on Windows) because
    # that is the separator the JVM expects on its classpath.
    stanford_jars = find_jars_within_path(stanford_base_dir)
    pos_tagger._stanford_jar = os.pathsep.join(stanford_jars)
    return pos_tagger
def load_pos_tagger():
    """Build a Stanford POS tagger from the ``stanford-postagger`` folder
    located inside the ``pykp`` directory this module lives under.

    Returns:
        StanfordPOSTagger: tagger whose classpath lists every jar found in
        that folder.
    """
    # Bind the module directory to `file_dir`: the expression below reads
    # that name, which the original never assigned locally.
    file_dir = os.path.dirname(__file__)
    # Cut the module directory right after the last 'pykp' occurrence.
    # NOTE(review): if 'pykp' is absent, rfind() returns -1 and the slice
    # degenerates to file_dir[:3] -- confirm this module always lives
    # under a 'pykp' directory.
    path = os.path.join(file_dir[: file_dir.rfind('pykp') + 4], 'stanford-postagger')
    print(path)

    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Put every jar next to the tagger on the Java classpath, joined with
    # the platform's path-list separator.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = os.pathsep.join(stanford_jars)
    return pos_tagger
def xxtest_StanfordPOSTagger(self):
    """Smoke-test the Stanford POS tagger on one tokenized sentence.

    The ``xx`` prefix keeps unittest discovery from collecting this method,
    since it only matches names starting with ``test``.
    """
    import os  # local import: this block does not otherwise rely on `os`

    jar = '\\usr\\stanford-postagger-full-2015-12-09\\stanford-postagger.jar'
    model = '\\usr\\stanford-postagger-full-2015-12-09\\models\\english-left3words-distsim.tagger'
    tagger = StanfordPOSTagger(model, jar)

    # Derive the installation directory from the jar path itself. Indexing
    # `tagger._stanford_jar[0]` yields only the first character when that
    # attribute is a string, which left the directory empty.
    stanford_dir = jar.rpartition('\\')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    # These are Windows-style paths, so the classpath must be joined with
    # the platform separator (';' on Windows) rather than a fixed ':'.
    tagger._stanford_jar = os.pathsep.join(stanford_jars)

    text = tagger.tag(word_tokenize("What's the airspeed of an unladen swallow ?"))
    self.assertTrue(text is not None)
def get_pos_tag(sen):  # pass sentence dataframe
    """Fill the 'POStag' cell of every row with the POS tags of its 'Arg' text.

    Args:
        sen: table-like object exposing ``index``, ``loc`` and ``set_value``
            (presumably a pandas DataFrame, per the original comment); the
            'Arg' cell of each row is split on whitespace and tagged.

    Returns:
        The same ``sen`` object, after ``set_value`` was called for each row.
    """
    tagger = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar',
    )

    # Extend the classpath with every jar found beside the tagger jar.
    jar_dir = tagger._stanford_jar.rpartition('/')[0]
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))

    for row_label in list(sen.index.get_values()):
        tagged = tagger.tag(sen.loc[row_label, 'Arg'].split())
        # Each tagged entry is indexed at [1] to keep only the tag.
        sen.set_value(row_label, 'POStag', [entry[1] for entry in tagged])
    return sen
def get_pos_tag(sen):
    """Fill the 'POStag' cell of every row with the POS tags of its 'Arg' text.

    Sets the ``CLASSPATH`` and ``STANFORD_MODELS`` environment variables
    before building the tagger.

    Args:
        sen: table-like object exposing ``index``, ``loc`` and ``set_value``
            (presumably a pandas DataFrame -- confirm against the caller);
            the 'Arg' cell of each row is split on whitespace and tagged.

    Returns:
        The same ``sen`` object, after ``set_value`` was called for each row.
    """
    # Set classpath to the POS tagger and point at its models directory.
    os.environ['CLASSPATH'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
    os.environ['STANFORD_MODELS'] = 'STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'

    tagger = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar',
    )

    # Extend the classpath with every jar found beside the tagger jar.
    jar_dir = tagger._stanford_jar.rpartition('/')[0]
    tagger._stanford_jar = ':'.join(find_jars_within_path(jar_dir))

    for row_label in list(sen.index.get_values()):
        tagged = tagger.tag(sen.loc[row_label, 'Arg'].split())
        # Each tagged entry is indexed at [1] to keep only the tag.
        sen.set_value(row_label, 'POStag', [entry[1] for entry in tagged])
    return sen
def check_postag(config):
    """Print the source text and its POS tags for every testing example.

    Args:
        config: mapping read for ``config['dataset']`` (passed to
            ``deserialize_from_file``) and ``config['testing_datasets']``
            (dataset names to iterate); it is also forwarded whole to
            ``load_additional_testing_data``.
    """
    # Only the vocabulary lookups are needed; the splits stored in the
    # serialized dataset are superseded by the additional testing data.
    _, _, _, idx2word, word2idx = deserialize_from_file(config['dataset'])

    # Locate the tagger relative to this module instead of a hard-coded,
    # developer-specific absolute path, so the check runs on any machine.
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    # Add the other jars from the Stanford directory once, up front: the
    # classpath does not depend on the example being tagged.
    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = os.pathsep.join(stanford_jars)

    # The loader receives the full list of dataset names, so a single call
    # covers every iteration below.
    testing_datasets = config['testing_datasets']
    test_sets = load_additional_testing_data(testing_datasets, idx2word, word2idx, config)

    for dataset_name in testing_datasets:
        test_set = test_sets[dataset_name]
        # zip() keeps the original pairing (and truncation) of sources with
        # targets; iterating it directly works on Python 2 and 3.
        for test_s_o, _ in zip(test_set['source'], test_set['target']):
            source = keyphrase_utils.cut_zero(test_s_o, idx2word)
            print(source)
            text = pos_tagger.tag(source)
            print(text)
from nltk.corpus import wordnet as wn
import pandas as pd

df = pd.DataFrame([])

###############################################################################
# Import Stanford Tagger
###############################################################################
_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
jar = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-postagger-2015-12-09/stanford-postagger.jar'
model = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-postagger-2015-12-09/models/english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# Put every jar found next to the tagger jar on the tagger's classpath.
stanford_dir = pos_tagger._stanford_jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
pos_tagger._stanford_jar = ':'.join(stanford_jars)

###############################################################################
# Import Stanford Parser
###############################################################################
_MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
os.environ['STANFORD_PARSER'] = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'
# Assign through os.environ rather than os.putenv(): putenv() changes the
# process environment without updating os.environ, so Python code reading
# os.environ would never see the value. Assigning to os.environ does both.
os.environ['CLASSPATH'] = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar"
path_to_jar_p = "/Users/clairekelleher/Desktop/Thesis/Fromdesktop/stanford-parser-full-2017-06-09/stanford-parser.jar"
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger

# Step 1: locations of the tagger jar and of the English model file.
sjar = '/Users/nischikata/PycharmProjects/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
model = '/Users/nischikata/PycharmProjects/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger'

# Step 2: build the module-level tagger from those two files.
POS_TAGGER = StanfordPOSTagger(model_filename=model, path_to_jar=sjar)

# Step 3: list every jar found in the tagger jar's directory on the
# tagger's classpath. Per the original author's note, this should happen
# anyway when CLASSPATH is set but in practice does not.
stanford_dir = POS_TAGGER._stanford_jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
POS_TAGGER._stanford_jar = ':'.join(stanford_jars)
# configure Stanford POS Tagger from nltk.tag import StanfordPOSTagger from nltk.internals import find_jars_within_path import platform stanford_pos_dir = 'resources/libs/stanford-postagger-2015-12-09/' eng_model_filename = stanford_pos_dir + 'models/english-left3words-distsim.tagger' my_path_to_jar = stanford_pos_dir + 'stanford-postagger.jar' tagger = StanfordPOSTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar) # https://gist.github.com/alvations/e1df0ba227e542955a8a # http://stackoverflow.com/questions/34361725/nltk-stanfordnertagger-noclassdeffounderror-org-slf4j-loggerfactory-in-windo stanford_jars = find_jars_within_path(stanford_pos_dir) separator = ';' if 'Windows' in platform.platform() else ':' tagger._stanford_jar = separator.join(stanford_jars) # End configuration class WLLR(object): def __init__(self, documents): super(WLLR, self).__init__() self.__documents = documents self.__set_contrast_indicator = set_fore_contrast_indicator | set_post_contrast_indicator print 'Initialize document info...' document_info = self.__init_document_info(documents) print 'Initialize dictionary...' self.__dictionary = {} self.__init_dictionary(document_info)