def get_pos_tags(content, stopwords, is_stemming, is_math):
    """Return the POS tags for the words of `content` (already tokenized).

    Each word is cleaned (optional math mapping, punctuation stripping,
    lowercasing, then `process_word` filtering); the word's tag is kept
    only when the cleaned word survives.

    Args:
        content: whitespace-separated token string.
        stopwords: stopword collection forwarded to `process_word`.
        is_stemming: whether `process_word` stems.
        is_math: whether simple equations are mapped to text first.

    Returns:
        list of POS tag strings, one per surviving word.
    """
    pos_tagger_dir = '/usr/users/swli/program/nlp_util/stanford-postagger'
    model = pos_tagger_dir + '/models/wsj-0-18-bidirectional-distsim.tagger'
    classpath = pos_tagger_dir + '/stanford-postagger_with_slf4j.jar'
    tagger = StanfordPOSTagger(model, classpath, java_options='-mx4000m')

    try:
        # raw strings: '\s' in a plain literal is an invalid escape on Py3
        tag_results = tagger.tag(re.split(r'\s+', content))
    except OSError:
        # Tagger choked (typically on very long input): fall back to
        # tagging sentence by sentence, re-appending the period split off.
        sentences = re.split(r'\s+\.\s+', content)
        tag_results = []
        for index, sentence in enumerate(sentences):
            if index < len(sentences) - 1:
                sentence += ' .'
            tag_results += get_contaminated_tag_results(sentence, tagger)

    punct = set(string.punctuation)  # built once, not once per word
    pos_tags = []
    for word, tag in tag_results:
        if is_math:
            # map simple equation to tokens
            word = simple_eq_to_text(word)
        # strip punctuation characters, then normalize case
        word = "".join(ch for ch in word if ch not in punct)
        word = word.lower()
        word = process_word(word, stopwords, is_stemming, is_math)
        if word:
            pos_tags.append(tag)
    return pos_tags
def genere_liste_natures(l_auteurs, STANFORD_PARSER='../stanford', STANFORD_MODELS='../stanford', JAVAHOME='/import/lhauseux/jre1.8.0_45/bin', bid = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/models/english-bidirectional-distsim.tagger', pt = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/stanford-postagger.jar'):
    """For each author in `l_auteurs`, POS-tag every article under
    ./auteurs/<author>/ and pickle the resulting tag list to
    ./listes_natures/<author>/<article>.

    Args:
        l_auteurs: iterable of author directory names.
        STANFORD_PARSER / STANFORD_MODELS / JAVAHOME: env-var values for
            the Stanford tools and the Java runtime.
        bid: path to the tagger model.
        pt: path to the tagger jar.
    """
    # Configure the Stanford tools and the Java runtime.
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS
    os.environ['JAVAHOME'] = JAVAHOME
    st = StanfordPOSTagger(bid, path_to_jar=pt, java_options='-mx15000m')
    nltk.internals.config_java(options='-xmx2G')
    # Create the output directory if needed.
    if not os.path.isdir('./listes_natures'):
        os.mkdir('./listes_natures')
    for auteur in l_auteurs:
        # Create the per-author output directory if needed.
        if not os.path.isdir('./listes_natures/' + auteur):
            os.mkdir('./listes_natures/' + auteur)
        articles = os.listdir('./auteurs/' + auteur)
        for article in articles:
            if article == 'liens.txt':
                continue
            # Read the article text; `with` guarantees the handle is
            # closed even if tagging raises (the original leaked it).
            with open('./auteurs/' + auteur + '/' + article, 'r') as f:
                contenu = f.read()
            # Tokenize, tag, and keep only the POS tag of each token.
            contenu = nltk.word_tokenize(contenu)
            contenu = [c[1] for c in st.tag(contenu)]
            with open('./listes_natures/' + auteur + '/' + article, 'wb') as f:
                pickle.dump(contenu, f)
            print(auteur, article)
def postag_sents(sents):
    """POS-tag a batch of sentences with the Stanford Arabic tagger.

    The Arabic tagger emits each token as "word/TAG" in the second slot
    of its output pair; that string is split back into a (word, tag)
    tuple.

    Args:
        sents: iterable of token lists.

    Returns:
        list of sentences, each a list of (word, tag) tuples.
    """
    if not os.environ.get('STANFORD_MODELS'):
        os.environ["STANFORD_MODELS"] = STANFORD_MODELS
    tagger = StanfordPOSTagger('arabic.tagger',
                               STANFORD_POSTAGGER + '/combined.jar')
    result = []
    for sent in tagger.tag_sents(sents):
        result.append([tuple(pair[1].split('/')) for pair in sent])
    return result
def postagger():
    """Point the Stanford tagger environment variables at the local
    install, then print the POS tags of every line in the module-level
    `content` iterable."""
    base = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
    os.environ['STANFORD_POSTAGGER_PATH'] = base
    os.environ['CLASSPATH'] = base + '/stanford-postagger.jar'
    os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']
    eng_tagger = StanfordPOSTagger(
        base + '/models/english-bidirectional-distsim.tagger')
    for line in content:
        print(eng_tagger.tag(line.split()))
def read_sstb_data(fpath='sstb/sstb_condensed_{}.csv'):
    """Read the condensed SSTB csv splits, POS-tag them, and build the
    word and POS vocabularies.

    Args:
        fpath: format template for the per-split csv path.

    Returns:
        (revs, vocab, pos_vocab, max_len) where revs is a list of dicts
        with text/tag tokens, a label vector, and a fold number.
    """
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    dataset_split = ['train', 'test', 'dev']
    for split in dataset_split:
        with open(fpath.format(split), "rb") as f:
            rdr = csv.reader(f)
            tokens_list = []
            labels = []
            # read all the lines
            for row in rdr:
                tokens_list.append(clean_str(row[0]).split())
                labels.append(row[1])
            # batch-tag the whole split in one JVM invocation
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)
            for tokens_tagged, label in zip(tokens_list_tagged, labels):
                # Transpose [(word, tag), ...] into parallel lists.
                # list() keeps this valid on Python 3 too — the original
                # subscripted the zip object, which is Python 2 only.
                text_tokens, tag_tokens = (list(seq) for seq in zip(*tokens_tagged))
                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)
                # get max len
                max_len = max(max_len, len(text_tokens))
                # create an entry for the current rev and add to the list
                revs.append({'text_tokens': text_tokens,
                             'tag_tokens': tag_tokens,
                             'label': conv_label_to_label_vec(label),
                             'fold_num': get_fold_num(split)})
    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)
    return revs, vocab, pos_vocab, max_len
def read_mr_data(num_folds, fpath='mr/rt-polarity.{}'):
    """Read the MR polarity files, POS-tag them, and build vocabularies.

    Cross-validation folds are assigned uniformly at random in
    [0, num_folds).

    Args:
        num_folds: number of CV folds.
        fpath: format template for the per-sentiment file path.

    Returns:
        (revs, vocab, pos_vocab, max_len).
    """
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    sentiments = ['pos', 'neg']
    for sentiment in sentiments:
        label_vec = conv_sent_to_vec(sentiment)
        with open(fpath.format(sentiment), "rb") as f:
            # read all the lines
            tokens_list = [clean_str(line).split()
                           for line in f.read().splitlines()]
        # batch-tag the whole polarity file in one JVM invocation
        for tokens_tagged in pos_tagger.tag_sents(tokens_list):
            # Transpose [(word, tag), ...]; list() keeps this Py2/Py3
            # safe (the original subscripted the zip object — Py2 only).
            text_tokens, tag_tokens = (list(seq) for seq in zip(*tokens_tagged))
            # add each token to vocab
            for token in text_tokens:
                if token not in vocab:
                    vocab[token] = len(vocab)
            for tag in tag_tokens:
                if tag not in pos_vocab:
                    pos_vocab[tag] = len(pos_vocab)
            # get max len
            max_len = max(max_len, len(text_tokens))
            # create an entry for the current rev and add to the list
            revs.append({'text_tokens': text_tokens,
                         'tag_tokens': tag_tokens,
                         'label': label_vec,
                         'fold_num': np.random.randint(0, num_folds)})
    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)
    return revs, vocab, pos_vocab, max_len
def pos_tagging(sentence):
    """Tag `sentence` with the Stanford English bidirectional model.

    Args:
        sentence: plain string; split on whitespace before tagging.

    Returns:
        list of (word, tag) pairs for every token.
    """
    tagger = StanfordPOSTagger(
        'stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger',
        'stanford-postagger-2014-08-27/stanford-postagger.jar')
    return tagger.tag(sentence.split())
def get_pos_tag(sen):
    """POS-tag the 'Arg' column of the sentence dataframe `sen`.

    For each row, the space-split 'Arg' text is tagged and the tag
    sequence is stored in that row's 'POStag' column.

    Args:
        sen: pandas DataFrame with an 'Arg' column.

    Returns:
        The same DataFrame, mutated in place.
    """
    st = StanfordPOSTagger(
        '/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',
        path_to_jar='/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')
    # Extend the tagger classpath with every jar beside the main one.
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    st._stanford_jar = ':'.join(find_jars_within_path(stanford_dir))
    for idx in list(sen.index.get_values()):
        tagged = st.tag(sen.loc[idx, 'Arg'].split())
        sen.set_value(idx, 'POStag', [pair[1] for pair in tagged])
    return sen
class Lemmatizer(AbstractStemmer):
    """Stemmer variant that POS-tags words with the Stanford tagger and
    lemmatizes them with WordNet (Python 2 code: print statements)."""

    def __init__(self, ):
        super(Lemmatizer, self).__init__()
        self.basename = 'lemmatized'
        # Stanford tagger capped at a 1 GB JVM heap.
        self.pos_tagger = StanfordPOSTagger('english-left3words-distsim.tagger', java_options='-mx1024m')
        self.lemmatizer = WordNetLemmatizer()
        # Maximum number of tokens handed to the tagger in one call.
        self.max_length = 500

    def process(self, words):
        """Lemmatize `words`, picking each WordNet POS from its Stanford tag.

        Tokens are buffered and flushed to the tagger at a sentence-final
        punctuation token once the buffer exceeds max_length; any
        remainder is tagged in max_length-sized chunks, then one final
        call handles the tail.
        """
        current_sentence = []
        pos_words = []
        for word in words:
            current_sentence.append(word)
            # Flush at a sentence boundary once the buffer is long enough.
            if word in '.!?' and len(current_sentence) > self.max_length:
                try:
                    pos_words += self.pos_tagger.tag(current_sentence)
                except Exception:
                    print 'Broke on', current_sentence
                    raise
                current_sentence = []
        # NOTE(review): '/' here is integer division only on Python 2;
        # under Python 3 this would need '//'.
        for i in range(len(current_sentence) / self.max_length):
            try:
                pos_words += self.pos_tagger.tag(current_sentence[:self.max_length])
            except Exception:
                print 'Broke on', current_sentence[:self.max_length]
                raise
            current_sentence = current_sentence[self.max_length:]
        # Tag whatever is left (possibly an empty list).
        try:
            pos_words += self.pos_tagger.tag(current_sentence)
        except Exception:
            print 'Broke on', current_sentence
            raise
        processed_words = [self.lemmatizer.lemmatize(wd, pos=self.get_wn_pos(ps)) for wd, ps in pos_words]
        return processed_words

    # from http://stackoverflow.com/questions/15586721
    def get_wn_pos(self, treebank_tag):
        """Map a Penn Treebank tag prefix to a WordNet POS (noun default)."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
def __init__(self, tagger_path, model_path, output_filename):
    """Set up the Stanford POS tagger and start from a clean output file.

    Any output file left over from an earlier run is deleted so new
    results are appended to a fresh file; a missing file is fine.
    """
    self.output_filename = output_filename
    self.st = StanfordPOSTagger(tagger_path, model_path)
    try:
        os.remove(output_filename)
    except OSError:
        pass
def pos_person_tagging(sentence):
    """Return the tokens of `sentence` tagged as plural/proper nouns.

    Args:
        sentence: plain string; split on whitespace before tagging.

    Returns:
        list of words whose Stanford tag is NNS, NNP or NNPS.
    """
    # Set the model and jar paths for the POS tagger.
    tagger = StanfordPOSTagger(
        'stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger',
        'stanford-postagger-2014-08-27/stanford-postagger.jar')
    tagged = tagger.tag(sentence.split())
    return [word for word, tag in tagged if tag in ('NNS', 'NNP', 'NNPS')]
def main():
    """Tag INFILE line by line with the Stanford Chinese model, writing
    one space-joined line of POS tags per input line to OUTFILE
    (streaming: each line is processed in turn)."""
    initialize()
    # create tagger
    model = '../stanford-postagger/models/chinese-distsim.tagger'
    jar = '../stanford-postagger/stanford-postagger.jar'
    zh_tagger = StanfordPOSTagger(model, jar)
    with io.open(INFILE, 'r', encoding='utf8') as qts, \
            io.open(OUTFILE, 'w', encoding='utf8') as pos:
        for line in qts:
            tagged = zh_tagger.tag(line)
            pos.write(" ".join("%s" % pair[1] for pair in tagged) + "\n")
    # quirk preserved: returns an empty tuple, not None
    return ()
def get_pos_tag(sen): os.environ['CLASSPATH']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar' #set classpath to pos tagger os.environ['STANFORD_MODELS']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models' st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar= '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models') stanford_dir = st._stanford_jar.rpartition('/')[0] stanford_jars = find_jars_within_path(stanford_dir) st._stanford_jar = ':'.join(stanford_jars) for i in list(sen.index.get_values()): t=st.tag(sen.loc[i,'Arg'].split()) tags=[] for j in range(0,len(t)): tags.append(t[j][1]) #print i sen.set_value(i,'POStag',tags) return sen
class FeatureProcessing(object):
    """Builds sparse count-vector features for phrases.

    Four feature families, each ablatable via filter_feature:
      '1' POS tags, '2' verb/adverb identities, '3' figure/citation
      mentions, '4' specific hedging/implication keywords.
    """

    def __init__(self):
        self.feat_index = {}  # feature name -> column index
        self.implication_words = ["demonstrate", "suggest", "indicate"]
        self.hyp_words = ["possible"]
        self.method_words = ["probe", "detect"]
        self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

    def get_features(self, phrase, filter_feature='0'):
        """Return the (possibly repeated) feature names for `phrase`.

        filter_feature names the ONE family to leave out ('0' keeps all).

        NOTE(review): the phrase-level checks (Feat 3 citation, Feat 4)
        run once per token, so their features repeat with phrase length;
        the inner loops also shadow the outer `word` — confirm intended.
        """
        words = word_tokenize(phrase)
        pos_tags = self.pos_tagger.tag(words)
        features = []
        for word, tag in pos_tags:
            wl = word.lower()
            # Feat 1: POS features (punctuation tags excluded)
            if filter_feature != '1':
                if tag != ',' and tag != '.':
                    features.append(tag)
            # Feat 2: Verb and adverb identity
            if filter_feature != '2':
                if tag == 'RB' or tag.startswith('VB'):
                    features.append(wl)
            # Feat 3: Presence of figure references and citations
            if filter_feature != '3':
                if word.startswith("Fig"):
                    features.append("figure")
                if re.search("[A-Z][^\s]+ et al.", phrase) is not None:
                    features.append("reference")
            # Feat 4: Presence of specific words or phrases
            if filter_feature != '4':
                if re.search("[Dd]ata not shown", phrase) is not None:
                    features.append("data_not_shown")
                for word in self.implication_words:
                    if word in phrase:
                        features.append("implication_word")
                for word in self.hyp_words:
                    if word in phrase:
                        features.append("hyp_word")
                for word in self.method_words:
                    if word in phrase:
                        features.append("method_word")
        return features

    def index_data(self, data, filter_feature='0'):
        """Assign a stable column index to every feature seen in `data`."""
        all_features = [self.get_features(datum, filter_feature) for datum in data]
        for features in all_features:
            for feat in features:
                if feat not in self.feat_index:
                    self.feat_index[feat] = len(self.feat_index)

    def featurize(self, phrase, filter_feature='0'):
        """Return `phrase` as a count vector over the indexed space.

        Features never seen by index_data are silently skipped.
        """
        indexed_features = [0] * len(self.feat_index)
        features = self.get_features(phrase, filter_feature)
        for feat in features:
            if feat not in self.feat_index:
                continue
            indexed_features[self.feat_index[feat]] += 1
        return indexed_features
def build_data_cv(data_file):
    """Read the train/test question files, POS-tag every question, and
    collect document-frequency vocabularies.

    Input lines look like "CLASS:text..."; class labels are numbered in
    order of first appearance.

    Args:
        data_file: format template for the per-split file path.

    Returns:
        (revs, vocab, pos_vocab, number_of_classes).
    """
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    split_list = ['train', 'test']
    class_to_label = {}
    for split in split_list:
        with open(data_file.format(split), "rb") as f:
            revs_text = []
            ys = []
            for line in f:
                # split once, not twice as the original did; note only the
                # text between the first and second ':' is kept, as before
                parts = line.split(':')
                qclass, rev = parts[0], parts[1]
                rev = clean_str(rev)
                # number classes in order of first appearance
                if qclass not in class_to_label:
                    class_to_label[qclass] = len(class_to_label)
                revs_text.append(rev.split())
                ys.append(class_to_label[qclass])
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for rev_tagged, y in zip(revs_tagged, ys):
                # Transpose [(word, tag), ...]; list() keeps this Py2/Py3
                # safe (the original subscripted the zip object, Py2 only).
                words, tag = (list(seq) for seq in zip(*rev_tagged))
                # NOTE(review): text drops its first token but tag does
                # not — preserved from the original; confirm intended.
                text = words[1:]
                for word in set(text):
                    vocab[word] += 1  # document frequency
                for postag in set(tag):
                    pos_vocab[postag] += 1
                revs.append({"y": y,
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": 0 if split == 'train' else 1})
    return revs, vocab, pos_vocab, len(class_to_label)
def build_data_cv(data_file):
    """Read the train/test/dev rating files, POS-tag every review, and
    collect document-frequency vocabularies.

    Each line is '\\t\\t'-separated; field 2 holds a 1-based rating
    (stored 0-based) and field 3 the review text.

    Args:
        data_file: format template for the per-split file path.

    Returns:
        (revs, vocab, pos_vocab).
    """
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx6000m')
    splits = ['train', 'test', 'dev']
    for split in splits:
        with open(data_file.format(split), "rb") as f:
            revs_text = []
            ratings = []
            for line in f.read().splitlines():
                line_split = line.split('\t\t')
                ratings.append(int(line_split[2]) - 1)  # 0-based rating
                revs_text.append(line_split[3].split())
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for rev_tagged, rating in zip(revs_tagged, ratings):
                # Transpose [(word, tag), ...]; list() keeps this Py2/Py3
                # safe (the original subscripted the zip object, Py2 only).
                text, tag = (list(seq) for seq in zip(*rev_tagged))
                for word in set(text):
                    vocab[word] += 1  # document frequency
                for postag in set(tag):
                    pos_vocab[postag] += 1
                revs.append({"y": rating,
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)})
    return revs, vocab, pos_vocab
class POSTagger:
    """Thin wrapper around StanfordPOSTagger that appends tagged
    sentences to a single output file."""

    def __init__(self, tagger_path, model_path, output_filename):
        """Create the tagger and delete any stale output file (a missing
        file is fine — output is opened in append mode later)."""
        self.output_filename = output_filename
        self.st = StanfordPOSTagger(tagger_path, model_path)
        try:
            os.remove(output_filename)
        except OSError:
            pass

    def output_knowledge(self, sentence):
        """Tag `sentence` (a space-separated string, a ' .' terminator is
        appended) and write one UTF-8 "word<TAB>tag" line per token,
        followed by a blank separator line."""
        sentence += " ."
        with open(self.output_filename, "a") as out:
            for word, pos_tag in self.st.tag(sentence.split()):
                out.write(("%s\t%s\n" % (word, pos_tag)).encode("utf-8"))
            out.write("\n")
# from nltk.tag import StanfordNERTagger # eng_tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # print eng_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # # # 中文命名实体识别 # chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz') # sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星' # for word, tag in chi_tagger.tag(sent.split()): # print word.encode('utf-8'), tag # # # 英文词性标注 from nltk.tag import StanfordPOSTagger # eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger') # print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # # 中文词性标注 chi_tagger = StanfordPOSTagger('chinese-distsim.tagger') # sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星' sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续' for _, word_and_tag in chi_tagger.tag(sent.split()): word, tag = word_and_tag.split('#') print word.encode('utf-8'), tag # 中英文句法分析 区别在于词库不同 from nltk.parse.stanford import StanfordParser eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz') sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split()) for tree in eng_parser.parse(sent): tree.pprint()
# -*- coding:utf-8 -*- import os import json import re import nltk from flask import Flask, render_template, request # importation des class de flask from nltk.tag import StanfordPOSTagger app = Flask(__name__) os.system("clear") root_path = "/home/e20160010106/projet teR NOUVEAU/StanfordTagguer/" # instance de la classe StanfordPOSTagger en UTF-8 pos_tagger = StanfordPOSTagger(root_path + "/models/french.tagger", root_path + "/stanford-postagger.jar", encoding='utf8') def pos_tag(sentence): # je transforme la phrase en tokens => si vous avez un texte avec plusieurs phrases, passez d'abord par nltk pour récupérer les phrases tokens = nltk.word_tokenize(sentence) tags = pos_tagger.tag(tokens) # lance le tagging print(tags) return tags @app.route('/') # route qui a pour chemin / vers la page d'accueil def recherche_1(): print('/recherche_1') return render_template(
# -*- coding:utf-8 -*- from nltk.tag import StanfordPOSTagger jar = 'C:/Users/Shijie Xu/Documents/NLTK/stanford-postagger-full-2012-01-06/stanford-postagger-2012-01-06.jar' model = 'C:/Users/Shijie Xu/Documents/NLTK/stanford-postagger-full-2012-01-06/models/french.tagger' pos_tagger = StanfordPOSTagger(model, jar, encoding="utf8") res = pos_tagger.tag( 'Si toutes ces excuses ne suffisent pas, je veux bien dédier ce livre à l’enfant qu’a été autrefois cette grande personne.' .split()) """tag_abbreviations = { 'A': 'adjective', 'Adv': 'adverb', 'CC': 'coordinating conjunction', 'Cl': 'weak clitic pronoun', 'CS': 'subordinating conjunction', 'D': 'determiner', 'ET': 'foreign word', 'I': 'interjection', 'NC': 'common noun', 'NP': 'proper noun', 'P': 'preposition', 'PREF': 'prefix', 'PRO': 'strong pronoun', 'V': 'verb', 'PONCT': 'punctuation mark', 'N': 'noun'}""" print(res)
def Process_ZH(File):
    """Parse an annotated Chinese file, POS-tag each annotated line, and
    append text / tags / annotation spans to Processed_Annotations.txt.

    Lines containing 'segment' markup are scanned token by token:
    'feature' tokens open an annotation [name, start, end]; 'state='
    and '</segment>' tokens advance the word counter and close pending
    annotation spans; plain tokens are collected as text. Python 2 code
    (str.decode, bytes-to-text writes).

    NOTE(review): the block below is reconstructed from whitespace-
    mangled source; nesting of some branches is inferred — verify
    against the original layout.
    """
    # Read file
    with open(File, 'r') as File:
        # print 'Opened'
        Input = File.readlines()
        try:
            for line in Input:
                # Checks if identifier is in the line
                if 'segment' in line:
                    # Sets counter to be on and starts count at 0
                    Annotations = []
                    Annotation_Next = False
                    Line = []
                    Word_Count = 0
                    # Switch to UTF-8 to ensure accurate counting
                    Line_UTF8_Decode = line.decode('utf-8')
                    Line_Split = Line_UTF8_Decode.split()
                    for Split in Line_Split:
                        if 'feature' in Split:
                            # Open a new annotation: [name, start, end=0]
                            Annotations.append([Split[17:-1], Word_Count, 0])
                        elif 'state=' in Split:
                            Line_Temp = re.findall('>([^>]*)</', Split)
                            if Line_Temp != []:
                                # Ensures that Line_Temp is a string
                                Line_Temp = Line_Temp[0]
                                Word_Count += 1
                                # To ensure nested entities are parsed correctly:
                                # close the innermost still-open annotation.
                                if Annotations[-1][2] != 0:
                                    Annotation_Next = True
                                    Length = range(len(Annotations))
                                    for x in Length[::-1]:
                                        if Annotations[x][2] == 0 and Annotation_Next == True:
                                            Annotations[x][2] = Word_Count
                                            Annotation_Next = False
                                else:
                                    Annotations[-1][2] = Word_Count
                            elif Line_Temp == [] and '<segment' in Split[15:]:
                                pass
                            else:
                                Word_Count += 1
                                Line_Temp = Split[15:]
                            if Line_Temp != []:
                                Line.append(Line_Temp)
                        elif '</segment>' in Split:
                            Seg_Split = Split.split('</segment>')
                            for x in Seg_Split:
                                if x != '':
                                    Word_Count += 1
                                    Line.append(x)
                                elif x == '':
                                    # Close the innermost still-open annotation.
                                    if Annotations[-1][2] != 0:
                                        Annotation_Next = True
                                        Length = range(len(Annotations))
                                        for x in Length[::-1]:
                                            if Annotations[x][2] == 0 and Annotation_Next == True:
                                                Annotations[x][2] = Word_Count
                                                Annotation_Next = False
                                    else:
                                        Annotations[-1][2] = Word_Count
                        elif '<segment' not in Split:
                            Line.append(Split)
                            # Checks if Split is a punctuation character
                            if re.findall('[%s]' % zhon.hanzi.punctuation, Split) == [] and Split != ':':
                                Word_Count += 1
                    Line_Done = ' '.join(Line)
                    # Tags using StanfordPOSTagger
                    ST = StanfordPOSTagger('~/Annotations/models/chinese-distsim.tagger', '~/Annotations/stanford-postagger.jar', encoding='utf-8')
                    Tags = ST.tag(Line)
                    Tags_Done = ''
                    for x in Tags:
                        # Chinese model emits "word#TAG"; keep the last two chars.
                        Tags_Done += x[1][-2:] + ' '
                    Annotations_Done = ''
                    for x in Annotations:
                        Annotations_Done += str(x[1]) + ',' + str(x[2]) + ',' + str(x[1]) + ',' + str(x[2]) + ' ' + x[0].upper() + '|'
                    # Append: text line, tag line, annotation line, blank line.
                    with open('Processed_Annotations.txt', 'a') as P_A:
                        P_A.write(Line_Done.encode('utf-8') + '\n')
                        P_A.write(Tags_Done + '\n')
                        P_A.write(Annotations_Done[:-1] + '\n' + '\n')
        except IndexError:
            # Annotations[-1] on an empty list: malformed line — skip file rest.
            pass
#将976个问题切词 import re import xlwt import xlrd import jieba from nltk.stem import WordNetLemmatizer from nltk.tag import StanfordPOSTagger modelfilename = 'C:\\Users\\asus\\AppData\\Roaming\\nltk_data\\stanfordpostagger\\models\\english-bidirectional-distsim.tagger' pathtojar = 'C:\\Users\\asus\\AppData\\Roaming\\nltk_data\\stanfordpostagger\\stanford-postagger.jar' eng_tagger = StanfordPOSTagger(model_filename=modelfilename, path_to_jar=pathtojar) lemmatizer = WordNetLemmatizer() #读取excel中某一列的数据,形成列表 def exceltolist(path, sheet, col): col_values = [] data = xlrd.open_workbook(path) table = data.sheet_by_name(sheet) col_values = table.col_values(col) #print(len(col_values)) return col_values ques = exceltolist('C:\\Users\\asus\\Desktop\\测试\\测试问题.xlsx', 'Sheet2', 0) #print(ques) # 读取同义词表:并生成一个字典。 combine_dict = {} for line in open(
# -*- coding: utf-8 -*- from nltk.tag import StanfordPOSTagger import utility as util import sys import os import re import properties reload(sys) sys.setdefaultencoding('utf-8') tagger = properties.tagger taggerJARPath = properties.taggerJARPath chunkFile = properties.chunkTestFile st = StanfordPOSTagger(tagger,taggerJARPath) def extractChunkTags(): result = util.extract_All_Tags(chunkFile) chunktags = [] for each in result: if '$' in each: continue temp1 = each.split("\t") temp2 = temp1[2].split() if len(temp2) == 1: temp2.append('XX') temp3 = [] temp3.append(temp1[0]) temp3.append(temp1[1]) temp3.append(temp2[0]) temp3.append(temp2[1])
# -*- coding: utf-8 -*- """ Created on Thu Jan 11 13:38:34 2018 @author: Etudiant """ ###################################################################################### from nltk.tag import StanfordPOSTagger jar = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar' model = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/models/french.tagger' import os java_path = "C:/Program Files/Java/jdk1.8.0_151/bin/java.exe" os.environ['JAVAHOME'] = java_path pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') words={} tab2 = {} for i in range(5): select=[] n=pos_tagger.tag(tab[i]) stops_verb=['NC','N','NPP'] for x in n: if x[1] in stops_verb: select.append(x[0]) #sel = max(set(select), key=select.count) #tab2[i]=sel words={} for word in set(select): count = 0
from opencc import OpenCC from nltk.parse.stanford import StanfordParser from nltk.tokenize import StanfordSegmenter from nltk.tag import StanfordNERTagger from nltk.tag import StanfordPOSTagger import pickle, re, pymysql, jieba, os import pandas as pd chi_tagger = StanfordPOSTagger('./StanfordNLP/models/chinese-distsim.tagger', './StanfordNLP/jars/stanford-ner.jar') segmenter = StanfordSegmenter( java_class='edu.stanford.nlp.ie.crf.CRFClassifier', path_to_jar="./StanfordNLP/jars/stanford-segmenter-3.9.2.jar", path_to_slf4j="./StanfordNLP/jars/slf4j-api.jar", path_to_sihan_corpora_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data", path_to_model="./StanfordNLP/stanford-segmenter-2018-10-16/data/pku.gz", path_to_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz" ) os.environ["JAVA_HOME"] = "/tmp2/b05902109/jdk-12.0.1"#注意這邊你們電腦要安裝java jdk,並放入你們自己的jdk的安裝路徑 os.environ["CLASSPATH"] = "./StanfordNLP/stanford-parser-2018-10-17" os.environ["STANFORD_MODELS"] = "./StanfordNLP/models" ch_parser = StanfordParser(model_path='./StanfordNLP/models/chinesePCFG.ser.gz') cc = OpenCC('t2s') # (Optional )convert from Simplified Chinese to Traditional Chinese def Get_Data_From_Mysql(source_name,keyword): contents_list=[] target_ids =[] db = pymysql.connect(host="18.217.252.187",port=3306, user="******",passwd="antimoneylaunderingisgood2",db="AML_News" ,charset='utf8') try:
from collections import Counter import multiprocessing import sys from __future__ import print_function stop_words_list = sw.words('english') punctus = ':-.,?;!\'\"' startwithupper_r = re.compile(r"[A-Z].*") containupper_r = re.compile(r".*[A-Z].*") java_path = r"/home/tmpuser/jre/bin/java" os.environ['JAVAHOME'] = java_path os.environ['JAVA_HOME'] = java_path tagger_pos = StanfordPOSTagger( r'/home/tmpuser/stanford-postagger-2017-06-09/' + r'models/english-bidirectional-distsim.tagger', r'/home/tmpuser/stanford-postagger-2017-06-09/' + r'stanford-postagger.jar', java_options='-mx90000m') pos_tag_list = [ 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '#', '$', "''", '(', ')', ',', '.', ':', '``', 'e', 'di', 'eff', 'anti-infl', 'classifi', '-' ] pos_tag_dict = {key: value for value, key in enumerate(pos_tag_list)} pos_tag_len = len(pos_tag_list) root_dir = 'description/%s/' % sys.argv[1] feature_folder = 'stylometry_features/%s/' % sys.argv[1]
new_sen.append(stem_word) print " ".join(new_sen) # stanford version import nltk from nltk.tag import StanfordPOSTagger from nltk.tokenize import StanfordTokenizer from nltk.stem import WordNetLemmatizer from nltk.stem import SnowballStemmer snowball_stemmer = SnowballStemmer('english') wordnet_lemmatizer = WordNetLemmatizer() tokenizer = StanfordTokenizer() eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger') text="Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a change.,They got themselves and Dan on a diet.".split(',') for sen in text: token_list=tokenizer.tokenize(sen[:-1]) tagged_sen=eng_tagger.tag(token_list) new_sen=[] for (word,tag) in tagged_sen: # print word,tag if tag[0]=='V': lemma_word=wordnet_lemmatizer.lemmatize(word,pos='v') else:
''' Created on 26 Mar 2018 @author: Owner ''' import pandas as pd from nltk.tag import StanfordPOSTagger from nltk.corpus import stopwords import nltk from textblob import TextBlob from pandas import DataFrame from pandas import ExcelWriter st = StanfordPOSTagger( '$/Multisimo/stanford-postagger-2018-02-27/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger', path_to_jar= '$/Multisimo/stanford-postagger-2018-02-27/stanford-postagger-2018-02-27/stanford-postagger.jar' ) df_proc = pd.read_excel( '$/Multisimo/Multisimo_Data/Processed_transcripts.xlsx') transcripts = df_proc['test_set'].tolist() words = set(nltk.corpus.words.words()) eng_stopwords = set(stopwords.words('english')) sentiment_list = [] nouns_list = [] #NN* verbs_list = [] #VB* personal_pronoun_list = [] #PRP possessive_pronoun_list = [] #PRP$ wh_list = [] #W*
import nltk import math import numpy as np from nltk.corpus import wordnet from nltk.tag import StanfordPOSTagger ###########################################ICANTUSEPOSTAGGER #from nltk.tag.stanford import POSTagger ############# #Change path# ############# java_path ="/usr/bin/java.exe" ###########################################ICANTUSEPOSTAGGER #st=POSTagger('C:\\Users\\CGR\\Desktop\\stanford-postagger-2015-04-20\\stanford-postagger-2015-04-20\\models\\english-bidirectional-distsim.tagger','C:\\Users\\CGR\\Desktop\\stanford-postagger-2015-04-20\\stanford-postagger-2015-04-20\\stanford-postagger.jar',encoding='UTF-8') st = StanfordPOSTagger('/home/jsrang02/english-bidirectional-distsim.tagger', '/home/jsrang02/stanford-postagger.jar') os.environ['JAVAHOME'] = java_path print("Please Input RTE Set Number:") key=input() ############# #Change path# ############# if key=="1": tree = parse("/home/jsrang02/RTE/dev/dev.xml") elif key=="2": tree = parse("/home/jsrang02/RTE/dev/dev2.xml") elif key=="3": tree = parse("/home/jsrang02/RTE/dev/dev3.xml") root = tree.getroot() true_similarity=[]
def number(sentence):
    """Return, space-joined, the tokens of `sentence` tagged 'DET' that
    det_or_nb() classifies as 'nb' (number-like determiners).

    Uses the module-level `model` and `jar` paths for the tagger.
    """
    tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    tagged = tagger.tag(sentence.split())
    picked = []
    for word, tag in tagged:
        if tag == 'DET' and det_or_nb(word) == 'nb':
            picked.append(word)
    return ' '.join(picked)
__author__ = 'Anirudh' import codecs import nltk from nltk.tag import StanfordPOSTagger nltk.internals.config_java("C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe") import os java_path = "C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe" os.environ['JAVAHOME'] = java_path # st = StanfordPOSTagger('english-bidirectional-distsim.tagger') st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\arabic.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar') #st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\english-bidirectional-distsim.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar') file="arabic_in.txt" source = codecs.open(file,"r","utf-16-be") destination = codecs.open("utf8encoder_out.txt","wb","utf-8") contents=source.read() destination.write(contents) destination = codecs.open("utf8encoder_out.txt","r","utf-8") contents2=destination.read() print contents2.split() print st.tag(contents2.split())
# Load the NLI sequence splits, map ids back to words, POS-tag the
# training sentences, and pickle the tag lists.
import pickle

with open('./all_seqs.pkl', 'rb') as fh:
    train, valid, test = pickle.load(fh)
with open('./nli_tokenizer.pkl', 'rb') as fh:
    tokenizer = pickle.load(fh)

# NOTE(review): `dict` shadows the builtin; name kept for compatibility
# with code elsewhere in this file.
dict = {w: i for (w, i) in tokenizer.word_index.items()}
inv_dict = {i: w for (w, i) in dict.items()}
word_candidate = {}

# Strip the leading/trailing special tokens from every s2 sequence.
trains = [t[1:-1] for t in train['s2']]
tests = [t[1:-1] for t in test['s2']]

from nltk.tag import StanfordPOSTagger
jar = 'stanford-postagger-2018-10-16/stanford-postagger.jar'
model = 'stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# Convert id sequences back to words and tag each training sentence.
train_text = [[inv_dict[t] for t in tt] for tt in trains]
all_pos_tags = []
for text in train_text:
    all_pos_tags.append(pos_tagger.tag(text))

# `with` guarantees the pickle is flushed and closed — the original left
# the handle open, risking a truncated output file.
with open('pos_tags.pkl', 'wb') as f:
    pickle.dump(all_pos_tags, f)
def __init__(self, ):
    """Set up the lemmatizer pipeline: Stanford POS tagger (1 GB JVM
    heap), WordNet lemmatizer, and the per-call token limit."""
    super(Lemmatizer, self).__init__()
    self.basename = 'lemmatized'
    self.max_length = 500  # max tokens handed to the tagger per call
    self.lemmatizer = WordNetLemmatizer()
    self.pos_tagger = StanfordPOSTagger('english-left3words-distsim.tagger',
                                        java_options='-mx1024m')
import re from nltk.parse.stanford import StanfordDependencyParser path_to_jar = r'C:\Users\Lovisa\Downloads\stanford-corenlp-full-2016-10-31\stanford-corenlp-full-2016-10-31\stanford-corenlp-3.7.0.jar' path_to_models_jar = r'C:\Users\Lovisa\Downloads\stanford-corenlp-full-2016-10-31\stanford-corenlp-full-2016-10-31\stanford-corenlp-3.7.0-models.jar' dependency_parser = StanfordDependencyParser( path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) from nltk.tag import StanfordPOSTagger st = StanfordPOSTagger('english-bidirectional-distsim.tagger') from nltk import word_tokenize from nltk.stem import WordNetLemmatizer wl = WordNetLemmatizer() def read_ingredients(filename, ing_list, new_ingredients): ingredients = {} section = None with open('baking_recipes/' + filename, 'r') as f: for line in f: if "INGREDIENTS" in line: continue if "PREPARATION" in line: break ##Ignore any blank lines line = line.strip() if not line: continue if line.endswith( ":"): #structural assumption: sections end with colon section = line
# Quick demo: tokenize a sentence and POS-tag it with the Stanford tagger.
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

MODEL_PATH = '/Users/yunhongmin/stanford-postagger-full/models/english-bidirectional-distsim.tagger'
JAR_PATH = '/Users/yunhongmin/stanford-postagger-full/stanford-postagger-3.9.2.jar'

pos_tagger = StanfordPOSTagger(MODEL_PATH, JAR_PATH)

text = 'If you unpack the tar file, you should have everything needed. This software provides a GUI demo, a command-line interface, and an API. Simple scripts are included to invoke the tagger. For more information on use, see the included README.txt.'
tokens = word_tokenize(text)

# Show the raw tokens, a blank line, then the (token, tag) pairs.
print(tokens)
print()
tagged = pos_tagger.tag(tokens)
print(tagged)
# -*- coding: utf-8 -*- from sys import argv from nltk.tag import StanfordNERTagger from nltk.tag import StanfordPOSTagger from nltk.tokenize import sent_tokenize import re import sys import logging reload(sys) sys.setdefaultencoding('utf8') script, filename, loadname = argv logging.basicConfig(format='preprocess progress:%(message)s', level=logging.INFO) NERTagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') POSTagger = StanfordPOSTagger('english-bidirectional-distsim.tagger') PRPList = ["He", "he", "She", "she", "His", "his", "Her", "him", "her", "him,", "him.", "her,", "her."] monthElement = "january|february|march|april|may|june|july|august|september|october|november|december" dateElement = "1|2|3|4|5|6|7|8|9|0" monthPattern = re.compile(monthElement, re.IGNORECASE) datePattern = re.compile(dateElement, re.IGNORECASE) #month: return 1. year for sure: return 2:. correct date: 3. not date 0. def dateJudge(datePair): dateString = datePair[0] dateTagger = datePair[1] if dateTagger == "CD": matchDate = re.findall(datePattern, dateString) if len(matchDate) == len(dateString): if int(dateString) > 31: return 2 else:
""" Handles POS tagging""" from nltk.tag import StanfordPOSTagger from lib.utils import POS_TAGGERS_JAR_PATH, POS_TAGGERS_EN_MODEL_PATH, POS_TAGGERS_DE_MODEL_PATH POS_TAGGER_EN = StanfordPOSTagger(POS_TAGGERS_EN_MODEL_PATH, POS_TAGGERS_JAR_PATH) POS_TAGGER_DE = StanfordPOSTagger(POS_TAGGERS_DE_MODEL_PATH, POS_TAGGERS_JAR_PATH) def get_pos_tags(tokenised_sentence, lang): """ Given a tokenised sentence, returns sentence as array of tuples of (token, POS Tag) """ if lang == 'en': pos_tagger = POS_TAGGER_EN elif lang == 'de': pos_tagger = POS_TAGGER_DE else: raise NotImplementedError('Pass in either en or de as language') return pos_tagger.tag(tokenised_sentence)
import frecuencias, filtros, spacy
from frecuencias import *
from filtros import *
from nltk import word_tokenize, sent_tokenize, bigrams, trigrams
from nltk.tag import StanfordPOSTagger
from docx import Document

#cd Documents\Repos\investigacion\source\
# Initial configuration of the Stanford POS tagger (verify that the paths are
# correct, otherwise it will not work).
tagger = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\models\spanish.tagger'
jar = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\stanford-postagger.jar'
etiquetador = StanfordPOSTagger(tagger, jar)

# Open the document and extract its text.
texto = ""  # full text of the document, stopwords included
# was: f = open(...); ...; f.close() — the handle leaked if python-docx raised
# while parsing. A context manager closes it on every path.
with open(
        r'C:\Users\lau_9\Documents\Repos\investigacion\source\data\TextoEjemplo.docx',
        'rb') as f:
    document = Document(f)
    for i in document.paragraphs:
        texto += i.text  #.lower()

# Tokenize and reformat the text, removing stopwords
#texto_ref = ' '.join(reformatearTexto(texto))
#palabras_ref = borrarStopwords(word_tokenize(texto_ref))
#print("TEXTO REFORMATEADO:\n" + texto.upper() + "\n")
#print("PALABRAS TOKENIZADAS Y SIN ETIQUETAR:\n" + str(palabras).upper() + "\n")
# Tag the tokens and show their labels
#palabras_etiquetadas = etiquetador.tag(palabras)
for path in paths: path_length = len(re.sub('/?[\[{].*?[\]}]', '', path).split('/')) - 1 # if path starts with version info, length should minus 1 if re.match('^/[v]?[0-9]', path): path_length -= 1 path_lengths.append(path_length) methods = [method for path in paths for method in swagger['paths'][path]] # frequency statistic fd_method = nltk.FreqDist(methods) fd_path_length = nltk.FreqDist(path_lengths) # part-of-speech judgement home_path = '/home/pyx/Desktop/stanford-postagger-full-2015-12-09' # home_path = '/Users/kenmick/Desktop/stanford-postagger-full-2015-12-09' st = StanfordPOSTagger(home_path + '/models/english-left3words-distsim.tagger', home_path + '/stanford-postagger.jar') url_noun = [] url_not_noun = [] pos = ['NN', 'NNS', 'IN', 'JJ', 'JJS', 'RB', 'TO', 'PRP', 'PRP$', 'NNP', 'NNPS', 'DT', 'VBG', 'VBN', 'VBD'] count = 1 for path in paths: print str(count) + '/' + str(len(paths)) count += 1 isNoun = True print path # remove parameters in path, such as {id}, [id], :id, and split url by level, namely by '/' urls = re.sub('/?[\[{].*?[\]}]|/:\w+', '', path).replace('.json', '').lstrip('/').split('/') for url in urls: for word_pos in st.tag(get_divided_url(url)):
# Read the first queued message, classify it, POS-tag it, and fuzzy-match it
# against the known mappings for the predicted label.
with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt', 'r') as f:
    first_line = f.readline()
# Drop the leading space-delimited field (presumably a message id — verify)
# and keep only the message text.
user_input = first_line.split(' ', 1)[1]
max_score = 0.1  # minimum similarity for a candidate to be remembered
map_val = ""
print("\nINPUT = ")
print(user_input)
label = classify(user_input)
suggest_list = []
suggest_message = ""
print("Classified as : " + str(label))
tokens = nltk.word_tokenize(user_input)
print(tokens)
st = StanfordPOSTagger(config['tagger']['model'], path_to_jar=config['tagger']['path'])
stanford_tag = st.tag(user_input.split())
print("Tags")
print(stanford_tag)
with open(MAPPING_PATH, 'r') as data_file:
    data = json.load(data_file)
# Jaro similarity between the input and every candidate phrase for the label
# (unicode() — Python 2 code). Track the best-scoring candidate.
for i in data[label]:
    dist = jf.jaro_distance(unicode(str(user_input), encoding="utf-8"),
                            unicode(str(i), encoding="utf-8"))
    suggest_list.append(tuple((dist, i)))
    print(dist)
    if (dist > max_score):
        max_score = dist
        map_val = i
# NOTE(review): chunk is truncated mid-call below.
if max_score < config['preferences']['similarity_threshold']:
    post_message(
def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads data and split into 10 folds.

    :param data_folder: [positive_file_path, negative_file_path].
    :param cv: number of cross-validation folds; each review receives a
        random split id in [0, cv).
    :param clean_string: if True, normalise each line with clean_str();
        otherwise just lowercase it.
    :return: (revs, vocab, pos_vocab) — datum dicts, word document
        frequencies, and POS-tag document frequencies.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    # The positive and negative files were processed by two verbatim copies of
    # the same ~30 lines; the shared logic now lives in _tag_file_reviews.
    _tag_file_reviews(pos_file, 1, clean_string, cv, pos_tagger,
                      revs, vocab, pos_vocab)
    _tag_file_reviews(neg_file, 0, clean_string, cv, pos_tagger,
                      revs, vocab, pos_vocab)
    return revs, vocab, pos_vocab


def _tag_file_reviews(path, label, clean_string, cv, pos_tagger,
                      revs, vocab, pos_vocab):
    """POS-tag every line of `path`, update `vocab`/`pos_vocab` in place, and
    append one datum dict (with sentiment `label`) per review to `revs`."""
    with open(path, "rb") as f:
        revs_text = []
        for line in f:
            rev = [line.strip()]
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())
    revs_tagged = pos_tagger.tag_sents(revs_text)
    for rev_tagged in revs_tagged:
        if not rev_tagged:  # guard: an empty review would break the unpacking
            continue
        # zip(*pairs) transposes [(word, tag), ...] into (words, tags).
        # was: list(zip(*rev_tagged)[0]) — indexing a zip object is
        # Python-2-only; tuple unpacking works on both 2 and 3.
        words_seq, tags_seq = zip(*rev_tagged)
        text = list(words_seq)
        tag = list(tags_seq)
        # Document frequency: each word/tag counted once per review.
        for word in set(text):
            vocab[word] += 1
        for postag in set(tag):
            pos_vocab[postag] += 1
        datum = {"y": label,
                 "text": ' '.join(text),
                 "tag": ' '.join(tag),
                 "num_words": len(text),
                 "split": np.random.randint(0, cv)}
        revs.append(datum)
from lexnlp.nlp.en.tokens import STOPWORDS, get_lemma_list
from lexnlp.config.stanford import STANFORD_POS_PATH

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2018, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.2.3"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

# Setup Stanford POS configuration
STANFORD_POS_FILE = os.path.join(STANFORD_POS_PATH, "stanford-postagger.jar")
STANFORD_TOKENIZER = StanfordTokenizer(path_to_jar=STANFORD_POS_FILE)
STANFORD_DEFAULT_TAG_MODEL = os.path.join(
    STANFORD_POS_PATH, "models", "english-bidirectional-distsim.tagger")
STANFORD_TAGGER = StanfordPOSTagger(STANFORD_DEFAULT_TAG_MODEL, STANFORD_POS_FILE)


def get_tokens_list(text, lowercase=False, stopword=False) -> List:
    """
    Get token list form text using Stanford libraries.
    :param text:
    :param lowercase:
    :param stopword:
    :return:
    """
    # NOTE(review): the function body is truncated in this chunk — only the
    # Stanford-availability guard is visible.
    if not is_stanford_enabled():
        raise RuntimeError(
            "USE_STANFORD is set to False. No Stanford functionality available."
        )
# 汉语命名实体识别 ch_tagger = StanfordNERTagger( r'/Users/cln/stanford-corenlp/stanford-ner/classifiers/chinese.kbp.distsim.crf.ser.gz', path_to_jar=r'/Users/cln/stanford-corenlp/stanford-ner/stanford-ner.jar') texts = r"欧洲 东部 的 罗马尼亚 首都 是 布加勒斯特 也 是 一 座 世界性 的 城市 北京 南阳 普京 中国 习主席" ch_rst = ch_tagger.tag(texts.split()) print('汉语命名实体识别:\n', ch_rst, '\n') from nltk.tag import StanfordPOSTagger # 汉语词性标注 chi_tagger = StanfordPOSTagger( r'/Users/cln/stanford-corenlp/postagger/models/chinese-distsim.tagger', path_to_jar=r'/Users/cln/stanford-corenlp/postagger/stanford-postagger.jar' ) print("汉语词性标注:") print(chi_tagger.tag(ch_result.split())) for _, word_and_tag in chi_tagger.tag(ch_result.split()): word, tag = word_and_tag.split('#') print(word, tag) print('\n') from nltk.parse.stanford import StanfordParser from nltk import Tree # 汉语句法分析 chi_parser = StanfordParser( r"/Users/cln/stanford-corenlp/parser/stanford-parser.jar",
def stopw(document):
    """Return `document` with English stopwords removed.

    Tokenisation is a plain whitespace split; the surviving words are
    re-joined with single spaces.
    """
    # was: membership tests against the stopword *list* — O(len(stop)) per
    # word. A set gives O(1) lookups with identical output.
    stop = set(stopwords.words('english'))
    return ' '.join(i for i in document.split() if i not in stop)


text = """ harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals in canada the northwestern united states and germanyover the years he has issued seven cds in 1995 his home is where the harp is won the muddy award for the best nw blues release from the cascade blues association in portland oregon as well that year it was nominated for a canadian juno for the best bluesgospel recording teamed up with graham guest on piano his cd naturally was voted 1 canadian blues album of 2010 by the blind lemon surveybrown tours extensively with his guitarist j arthur edmonds performing their electric mid1950s chicago blues either as a duo or with the full band while he is home he juggles a few combos working many venues big and small he also leads the harpdog brown band which is a gutsy traditional chicago blues band in 2014 they released what it is comprising mainly original songs and a few classic covers influential blues promoter and broadcaster holger petersen called what it is browns best albumhe was just awarded the maple blues award in toronto for best harmonica player in canada 2014 and was honored with a life time membership to the hamilton blues society """

# Add the jar and model via their path (instead of setting environment variables):
jar = 'D:\\Python\\SRC\\stanford-postagger-full-2018-10-16\\stanford-postagger.jar'
model = 'D:\\Python\\SRC\\stanford-postagger-full-2018-10-16\\models\\english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# Strip stopwords, then POS-tag the remaining tokens.
clean = stopw(text)
text = pos_tagger.tag(word_tokenize(clean))
print(text)

# Named-entity recognition on the title-cased text.
st = StanfordNERTagger(
    'D:\\Python\\SRC\\stanford-ner-2018-10-16\\classifiers\\english.all.3class.distsim.crf.ser.gz',
    'D:\\Python\\SRC\\stanford-ner-2018-10-16\\stanford-ner.jar',
    encoding='utf-8')
initText = clean.title()
tokenized_text = word_tokenize(initText)
tokenized_text  # bare expression — notebook residue, no effect as a script
classified_text = st.tag(tokenized_text)
classified_text  # bare expression — notebook residue, no effect as a script
def __init__(self):
    """Initialise feature bookkeeping, cue-word lexicons, and the POS tagger."""
    # Cue-word lists (presumably used to detect rhetorical function of
    # sentences — verify against the methods that read them).
    self.implication_words = ["demonstrate", "suggest", "indicate"]
    self.hyp_words = ["possible"]
    self.method_words = ["probe", "detect"]
    # Starts empty; presumably filled with feature-name -> index entries later.
    self.feat_index = {}
    self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
def build_data_cv(data_file, all_phrases, binary, min_len=4):
    """Load the SST train/test/dev splits (and optionally the training
    phrases), POS-tag every review, and collect vocabulary statistics.

    :param data_file: format string with one `{}` slot for the split name.
    :param all_phrases: if True, also process the "train_phrases" file,
        skipping phrases that duplicate a full training sentence or are
        shorter than `min_len` tokens.
    :param binary: if True, drop neutral reviews (sent == 2) and map labels
        through sentiment_label_for_binary().
    :param min_len: minimum phrase length (tokens) kept from train_phrases.
    :return: (revs, vocab, pos_vocab).
    """
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar', 'utf8', False, '-mx2000m')
    splits = ['train', 'test', 'dev']
    sentence_set = set()
    for split in splits:
        with open(data_file.format(split), "rb") as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            for row in reader:
                rev, sent = row[0], int(row[1])
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev = clean_str_sst(rev)
                if split == 'train':
                    # remember training sentences so phrases can be de-duped
                    sentence_set.add(rev)
                revs_text.append(rev.split())
                # check for binary case
                sents.append(sentiment_label_for_binary(sent) if binary else sent)
        _tag_and_append(pos_tagger, revs_text, sents, get_split_num(split),
                        revs, vocab, pos_vocab)
    if all_phrases:
        with open(data_file.format("train_phrases"), "rb") as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            count = 0
            for row in reader:
                rev, sent = row[0], int(row[1])
                rev = clean_str_sst(rev)
                if rev in sentence_set:
                    count += 1  # phrase duplicates a full training sentence
                    continue
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev_tokens = rev.split()
                if len(rev_tokens) < min_len:
                    continue
                revs_text.append(rev_tokens)
                # check for binary case
                sents.append(sentiment_label_for_binary(sent) if binary else sent)
        _tag_and_append(pos_tagger, revs_text, sents, get_split_num('train'),
                        revs, vocab, pos_vocab)
        # was: Python-2 `print "..."` statement; print() works on both 2 and 3.
        print("{} sentences in phrases".format(count))
    return revs, vocab, pos_vocab


def _tag_and_append(pos_tagger, revs_text, sents, split_num,
                    revs, vocab, pos_vocab):
    """POS-tag each token list in `revs_text`, update the frequency counters
    in place, and append one datum dict per review (label taken from the
    parallel `sents` list) to `revs`. Was duplicated inline three times."""
    revs_tagged = pos_tagger.tag_sents(revs_text)
    for i in range(len(revs_tagged)):
        rev_tagged = revs_tagged[i]
        if not rev_tagged:  # guard: empty review would break the unpacking
            continue
        # was: list(zip(*rev_tagged)[0]) — Python-2-only zip indexing.
        words_seq, tags_seq = zip(*rev_tagged)
        text = list(words_seq)
        tag = list(tags_seq)
        for word in set(text):
            vocab[word] += 1
        for postag in set(tag):
            pos_vocab[postag] += 1
        revs.append({"y": sents[i],
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": split_num})
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = '압축을 푼 장소/models/english-bidirectional-distsim.tagger'
STANFORD_POS_JAR_PATH = '압축을 푼 장소/stanford-postagger-3.6.0.jar'

pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

# A made-up example — replace this with any sentence you want to experiment with.
text = 'One day in November 2016, the two authors of this book, Seungyeon and Youngjoo, had a coffee at Red Rock cafe, which is a very popular place in Mountain View.'
tokens = word_tokenize(text)
print(tokens)  # print the split tokens
print()

# Tag ONCE and reuse the result: the original called pos_tagger.tag(tokens)
# twice, and every .tag() call spawns a fresh Stanford tagger JVM process.
tagged = pos_tagger.tag(tokens)
print(tagged)  # print the POS-tagged result

# Pick out only the verbs and nouns (Penn tags starting with V or N).
noun_and_verbs = [token[0] for token in tagged
                  if token[1].startswith('V') or token[1].startswith('N')]
print(', '.join(noun_and_verbs))
# encoding: utf-8
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger

# Chinese word segmenter backed by the Stanford segmenter jars.
segmenter = StanfordSegmenter(
    # jar the segmenter depends on
    path_to_jar=r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # segmentation data directory
    path_to_sihan_corpora_dict=r"/home/jiangix/document/stanford-segmenter/data",
    # model based on the PKU People's Daily corpus (2005 bakeoff)
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz")
segmenter.default_config('zh')

# Segment a sample sentence, then POS-tag the space-separated words with the
# Chinese Stanford model.
segmented = segmenter.segment(u'我喜欢学习编程')

chi_tagger = StanfordPOSTagger(
    model_filename=r"/home/jiangix/document/stanford-postagger/models/chinese-distsim.tagger",
    path_to_jar=r"/home/jiangix/document/stanford-postagger/stanford-postagger.jar")
print(chi_tagger.tag(segmented.split()))