class Lemmatizer(AbstractStemmer):

    def __init__(self):
        super(Lemmatizer, self).__init__()
        self.basename = 'lemmatized'
        self.pos_tagger = StanfordPOSTagger(
            'english-left3words-distsim.tagger', java_options='-mx1024m')
        self.lemmatizer = WordNetLemmatizer()
        self.max_length = 500

    def process(self, words):
        current_sentence = []
        pos_words = []
        for word in words:
            current_sentence.append(word)
            # Tag a sentence once it ends and has grown past max_length tokens.
            if word in '.!?' and len(current_sentence) > self.max_length:
                try:
                    pos_words += self.pos_tagger.tag(current_sentence)
                except Exception:
                    print('Broke on', current_sentence)
                    raise
                current_sentence = []
        # Tag whatever is left over, in chunks of at most max_length tokens.
        for i in range(len(current_sentence) // self.max_length):
            try:
                pos_words += self.pos_tagger.tag(
                    current_sentence[:self.max_length])
            except Exception:
                print('Broke on', current_sentence[:self.max_length])
                raise
            current_sentence = current_sentence[self.max_length:]
        try:
            pos_words += self.pos_tagger.tag(current_sentence)
        except Exception:
            print('Broke on', current_sentence)
            raise
        processed_words = [
            self.lemmatizer.lemmatize(wd, pos=self.get_wn_pos(ps))
            for wd, ps in pos_words
        ]
        return processed_words

    # from http://stackoverflow.com/questions/15586721
    def get_wn_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
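# Hedged usage sketch for the Lemmatizer above: assumes AbstractStemmer is importable from the
# surrounding project, that StanfordPOSTagger, WordNetLemmatizer and wordnet are imported at
# module level, and that the Stanford model/jar are reachable via CLASSPATH / STANFORD_MODELS.
lemmatizer = Lemmatizer()
print(lemmatizer.process('The striped bats were hanging on their feet .'.split()))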
def emotion_pos_tagging():
    tag_target = ['V', 'N', 'J', 'R']
    tag_list = []
    # Load the emotion word dictionary from the Excel file
    df_emotion = open_emotion_dataframe()
    # POS tagging (the tagger is built once, outside the word loop)
    STANFORD_POS_MODEL_PATH = "path/english-bidirectional-distsim.tagger"
    STANFORD_POS_JAR_PATH = "path/stanford-postagger-3.9.2.jar"
    pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)
    for word in df_emotion['영어']:
        pos = pos_tagger.tag([word])
        tag_first = pos[0][1][0]
        if tag_first in tag_target:
            if tag_first == 'V':
                tag_list.append('동사')    # verb
            if tag_first == 'N':
                tag_list.append('명사')    # noun
            if tag_first == 'J':
                tag_list.append('형용사')  # adjective
            if tag_first == 'R':
                tag_list.append('부사')    # adverb
        else:
            tag_list.append('')
    df_emotion['품사'] = tag_list
    # Write out the POS-tagged, extended word dictionary dataframe
    df_emotion.to_excel(f"../res/dic/감정 단어.xlsx")
def update_training_data(usr_input,label,command): format_input = "" st = StanfordPOSTagger(config['tagger']['model'],path_to_jar=config['tagger']['path']) tags = st.tag(usr_input.split()) print(tags) with open(MAPPING_PATH,'r') as data_file: data = json.load(data_file) for pos,tag in enumerate(tags): if(tag[1] != "NNP"): format_input += tag[0] format_input += " " data[label].append(format_input) with open(MAPPING_PATH, "w") as jsonFile: jsonFile.write(json.dumps(data, sort_keys=False, indent=4)) with open(TRAINDATA_PATH,'r') as data_file: data = json.load(data_file) add_dict = { "text" : format_input, "label" : label } data.append(add_dict) with open(TRAINDATA_PATH, "w") as jsonFile: jsonFile.write(json.dumps(data, sort_keys=False, indent=4)) with open(COMMAND_PATH,'r') as data_file: data = json.load(data_file) add_dict = { format_input : command } data[label].update(add_dict) with open(COMMAND_PATH,"w") as jsonFile: jsonFile.write(json.dumps(data, sort_keys=False, indent=4)) print('Added')
def handleMessage(sid, txt): tagger = StanfordPOSTagger(_path_to_model, path_to_jar=_path_to_jar, java_options='-mx4096m') tagged = tagger.tag(nltk.word_tokenize(txt)) responseMessage = str(tagged) sendResponse(sid, responseMessage)
def get_postag_with_index(sources, idx2word, word2idx): path = os.path.dirname(__file__) path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/' print(path) # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar' jar = path + '/stanford-postagger.jar' model = path + '/models/english-bidirectional-distsim.tagger' pos_tagger = StanfordPOSTagger(model, jar) # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger' # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger' stanford_dir = jar.rpartition('/')[0] stanford_jars = find_jars_within_path(stanford_dir) pos_tagger._stanford_jar = ':'.join(stanford_jars) tagged_source = [] # Predict on testing data for idx in xrange(len(sources)): # len(test_data_plain) test_s_o = sources[idx] source_text = keyphrase_utils.cut_zero(test_s_o, idx2word) text = pos_tagger.tag(source_text) print('[%d/%d] : %s' % (idx, len(sources), str(text))) tagged_source.append(text) return tagged_source
def call_reia(): max_score = 0.1 map_val = "" print('-----------------------') user_input = raw_input("enter the string: ") #user_name = get_username(first_line.split(' ', 1)[0]) suggest_list = [] suggest_message = "" #prev_ts = ts print("\nINPUT = ") print(user_input) label = classify(user_input) if label == "": post_message( "Sorry, I could not understand. Please rephrase and try again.") consume_message() print("Classified as : " + str(label)) tokens = nltk.word_tokenize(user_input) print(tokens) st = StanfordPOSTagger(config['tagger']['model'], path_to_jar=config['tagger']['path']) stanford_tag = st.tag(user_input.split()) print("Tags") print(stanford_tag) """with open(MAPPING_PATH,'r') as data_file:
def new_load_data(f_name): data = {} import os java_path = "C:/Program Files/Java/jdk1.8.0_121/bin/java.exe" os.environ['JAVAHOME'] = java_path st = StanfordPOSTagger('english-bidirectional-distsim.tagger', 'stanford-postagger.jar', encoding='utf-8') with open(f_name, 'r') as file: for line in file: fields = line.split('\t') sent_id = fields[0] """if sent_id == 'sent1656': print('yay')""" data[sent_id] = {} data[sent_id][SENTENCE] = fields[1].strip('\n').split() data[sent_id][ENTITIES] = {} tokenized_sent = nltk.sent_tokenize(fields[1]) for sent in tokenized_sent: chunk_id = 0 for chunk in nltk.ne_chunk(st.tag(nltk.word_tokenize(sent))): if hasattr(chunk, 'label'): data[sent_id][ENTITIES][chunk_id] = ( chunk.label(), ' '.join(c[0] for c in chunk)) chunk_id += len([c[0] for c in chunk]) print(chunk.label(), ' '.join(c[0] for c in chunk)) else: chunk_id += 1 # assert chunk_id < len(fields[1].split()) #sent = st.tag(fields[1].split()) #print(sent) return data
def TagProb(Readfile, file): if path.exists(sys.path[0] + '/Preparation/save/data/' + file): remove(sys.path[0] + '/Preparation/save/data/' + file) tagger = StanfordPOSTagger(model_filename, path_to_jar) WordDict = {} for line in open(Readfile): sentence = tagger.tag(line.split()) for WordTag in sentence: if WordTag[0] not in WordDict.keys(): WordDict[WordTag[0]] = {} WordDict[WordTag[0]][WordTag[1]] = 1 else: if WordTag[1] not in WordDict[WordTag[0]].keys(): WordDict[WordTag[0]][WordTag[1]] = 1 else: WordDict[WordTag[0]][ WordTag[1]] = 1 + WordDict[WordTag[0]][WordTag[1]] for word in WordDict.keys(): sum_freq = 0 for tag in WordDict[word].keys(): sum_freq = WordDict[word][tag] + sum_freq for tag in WordDict[word].keys(): WordDict[word][tag] = WordDict[word][tag] / sum_freq with open(file, 'a', encoding='utf-8') as Writer: for word in WordDict.keys(): Writer.write(str(word) + ':' + str(WordDict[word]) + '\n') return WordDict
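# Hedged usage sketch for TagProb() above: the corpus and output file names are illustrative,
# and model_filename / path_to_jar are assumed to be defined at module level as in the snippet.
word_tag_probs = TagProb('corpus.txt', 'tag_probabilities.txt')
print(word_tag_probs.get('book'))  # e.g. a dict such as {'NN': 0.8, 'VB': 0.2}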
def features(text): # POS-Tagging tagged = StanfordPOSTagger(model_filename=model_filename, path_to_jar=path_to_jar, encoding='utf8', verbose=False, java_options='-mx3000m') classified_word = tagged.tag(nltk.word_tokenize(text)) text_postags = [] for index_classified in classified_word: text_postags.append(index_classified[1]) freq_pos = nltk.FreqDist(text_postags) adverb, adjective, noun, pronoun, verb = 0, 0, 0, 0, 0 for index_freq in freq_pos.most_common(len(freq_pos)): if index_freq[0] in ["RB", "RBR", "RBS"]: adverb += index_freq[1] elif index_freq[0] in ["JJ", "JJR", "JJS"]: adjective += index_freq[1] elif index_freq[0] in ["NN", "NNS", "NNP", "NNPS"]: noun += index_freq[1] elif index_freq[0] in ["PRP", "PRP$"]: pronoun += index_freq[1] elif index_freq[0] in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]: verb += index_freq[1] X_test = [] X_test.extend( (adverb / adjective, adverb / noun, adverb / pronoun, adjective / verb, adjective / pronoun, noun / verb, noun / pronoun, verb / pronoun)) return X_test
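# Hedged usage sketch for features() above. Note that the returned ratios assume every POS class
# (adverb, adjective, noun, pronoun, verb) occurs at least once in the text; otherwise the
# divisions raise ZeroDivisionError. model_filename and path_to_jar are assumed module-level paths.
sample = "She quickly read the long book while he patiently waited for her."
print(features(sample))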
def get_postag_with_record(records, pairs): path = os.path.dirname(__file__) path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/' print(path) # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar' jar = path + '/stanford-postagger.jar' model = path + '/models/english-bidirectional-distsim.tagger' pos_tagger = StanfordPOSTagger(model, jar) # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger' # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger' stanford_dir = jar.rpartition('/')[0] stanford_jars = find_jars_within_path(stanford_dir) pos_tagger._stanford_jar = ':'.join(stanford_jars) tagged_source = [] # Predict on testing data for idx, (record, pair) in enumerate(zip(records, pairs)): # len(test_data_plain) print('*' * 100) print('File: ' + record['name']) print('Input: ' + str(pair[0])) text = pos_tagger.tag(pair[0]) print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text))) tagged_source.append(text) return tagged_source
def _preprpcessing_eng(id_list): stop_w = set(stopwords.words('english')) eng_tagger = StanfordPOSTagger( model_filename= '/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/models/english-bidirectional-distsim.tagger', path_to_jar= '/home/zhouh/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger.jar' ) for i in id_list: try: text = read_file('/home/zhouh/Thesis/code/Transcripts/english/' + i + '.txt') words = nt.word_tokenize(text, language='english') word = [x for x in words if x not in string.punctuation] word = [x for x in word if x not in stop_w] word = [x for x in word if not x.isdigit()] word = eng_tagger.tag(word) tt = '' for w in word: tt += '/'.join(w) + ' ' new_path = '/home/zhouh/Thesis/code/Transcripts/eng_preprocessed/' + i + 'pre.txt' if os.path.exists(new_path): os.remove(new_path) with open(new_path, 'w') as f: f.write(tt) except: continue
def genere_liste_natures(l_auteurs, STANFORD_PARSER='../stanford', STANFORD_MODELS='../stanford', JAVAHOME='/import/lhauseux/jre1.8.0_45/bin', bid='/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/models/english-bidirectional-distsim.tagger', pt='/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/stanford-postagger.jar'):
    # Configure the Stanford tools and the Java environment:
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS
    os.environ['JAVAHOME'] = JAVAHOME
    st = StanfordPOSTagger(bid, path_to_jar=pt, java_options='-mx15000m')
    nltk.internals.config_java(options='-xmx2G')
    # Create the listes_natures directory if necessary
    if not os.path.isdir('./listes_natures'):
        os.mkdir('./listes_natures')
    for auteur in l_auteurs:
        # Create the author's own directory if necessary
        if not os.path.isdir('./listes_natures/' + auteur):
            os.mkdir('./listes_natures/' + auteur)
        articles = os.listdir('./auteurs/' + auteur)
        for article in articles:
            if article != 'liens.txt':
                # Read the article as plain text
                f = open('./auteurs/' + auteur + '/' + article, 'r')
                contenu = f.read()
                f.close()
                # Tokenize, tag, and keep only the POS tags
                contenu = nltk.word_tokenize(contenu)
                contenu = st.tag(contenu)
                contenu = [c[1] for c in contenu]
                # Save (pickle) the result
                f = open('./listes_natures/' + auteur + '/' + article, 'wb')
                pickle.dump(contenu, f)
                f.close()
                print(auteur, article)
def fr_words(DATA_PATH, candidats):
    import pandas as pd
    import operator
    import nltk
    from nltk.corpus import stopwords
    # NLTK Stanford French tagger
    from nltk.tag import StanfordPOSTagger
    jar = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
    model = 'C:/Users/user/Downloads/stanford-postagger-full-2018-02-27/stanford-postagger-full-2018-02-27/models/french.tagger'
    import os
    java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
    os.environ['JAVAHOME'] = java_path
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
    # Tokenizer (strips #, @ and punctuation)
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    # Read the tweets
    df = pd.read_csv(DATA_PATH)
    df = df[df['text'].notnull()]
    a = len(df)
    fr_words = [[] for i in range(len(candidats))]
    indesirable = ["RT", "https", "http", "c", "les", "et", "ça", "coach", "ils", "thevoice", "quand", "donc", "thevoice_tf1"]
    for j in range(len(candidats)):
        count = dict()
        candidat = candidats[j]
        for i in range(0, a):
            if i in [7224, 16457, 16458, 22348, 22349, 22350, 22351, 22352, 22353, 22354, 22355]:
                continue
            else:
                line = df.at[i, 'text']
                tokenized = tokenizer.tokenize(line)
                # Keep only lowercased words that are neither French stop words (de, que, dans...)
                # nor in the unwanted-word list
                words = [w.lower() for w in tokenized if (w not in stopwords.words('french') and w not in indesirable)]
                if set(candidat) & set(words):
                    for word in words:
                        if word in count.keys():
                            count[word] += 1
                        else:
                            count[word] = 1
                else:
                    continue
        count = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
        fr_words1 = count[0:50]
        # Drop all verbs (keep only tokens the French tagger does not label V or VINF)
        for element in fr_words1:
            if pos_tagger.tag(element[0].split())[0][1] not in ['VINF', 'V']:
                fr_words[j].append(element)
            else:
                continue
    return fr_words
def main(): with open('/home/abhinav/PycharmProjects/video_enrichment/text.txt', 'r') as myfile: text = myfile.read().replace('\n', '') # text = """Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.""" # text = "Concepts present in text are outline of machine learning, data mining, statistics, cluster analysis, algorithms like logic, pseudo code." text = p.sub('', text) sentences = nltk.sent_tokenize(text) for sentence in sentences: sentence = sentence.lower() # Lower Case the whole sentence sentence = p.sub( '', sentence) # Removing anything enclosed within brackets print(sentence) ## TAGGING st_tag = StanfordPOSTagger(model_filename=eng_model_filename_pos, path_to_jar=my_path_to_pos_jar) tagged_sentence = st_tag.tag(word_tokenize(sentence)) # print(tagged_sentence) ## ENTITY RECOGNITION # st_ner = StanfordNERTagger(model_filename=eng_model_filename_ner, path_to_jar=my_path_to_ner_jar) # print(st_ner.tag('Rami Eid is studying at Stony Brook University in NY'.split())) ## PARSING # print(parsing(sentence)) ## Chunking Using Regex regex_exps = [ "NP: {<JJ|NN.?>*<NN.?>}", "NP: {<JJ|NN.?>*<NN.?><IN>?<JJ|NN.?>*<NN.?>}", "NP: {<JJ>*<NN.?>+}" ] # Include the following pattern to count conjuctions "NP: {<JJ|NN.?>*<NN.?><CC>?<JJ|NN.?>*<NN.?>}" for grammar in regex_exps: IOB_tagged = chunking(tagged_sentence, grammar) remove_IOBtags(IOB_tagged) # print(concept_count) ## Prune concepts on word level using word frequency count on BBC corpus prune_concepts_WordLevel() print("Pruned concepts are:", pruned_concepts) ## Identify Wikipedia articles(titles) that match concepts extracted from the text if Jaccard Similarity is one or if wikipedia title is a part of concept extracted Wikipedia_aritcle_matching() print("\n", concept_wiki_article) print("\nFinal List Of Concepts:", final_wiki_concepts) # prereq_graph.add_nodes_from(final_wiki_concepts) wiki_based_similarity() Connected_components = nx.connected_components(un_prereq_graph) print("\n Pre-req Graph successfully created") # print("\nConnected Components: ") # print(Connected_components) nx.draw(prereq_graph, with_labels=True) plt.axis('off') plt.savefig("graph_prereq.png")
def get_pos_tags(content, stopwords, is_stemming, is_math): # Content should be tokenized pos_tagger_dir = '/usr/users/swli/program/nlp_util/stanford-postagger' model = pos_tagger_dir + '/models/wsj-0-18-bidirectional-distsim.tagger' classpath = pos_tagger_dir + '/stanford-postagger_with_slf4j.jar' tagger = StanfordPOSTagger(model, classpath, java_options='-mx4000m') try: tag_results = tagger.tag(re.split('\s+', content)) except OSError: sentences = re.split('\s+\.\s+', content) tag_results = [] for index in range(len(sentences)): sentence = sentences[index] if index < len(sentences)-1: sentence += ' .' tag_results += get_contaminated_tag_results(sentence, tagger) pos_tags = [] for pair in tag_results: word = pair[0] # map simple equation to tokens if is_math: word = simple_eq_to_text(word) # remove punctuation word = "".join(l for l in word if l not in string.punctuation) word = word.lower() word = process_word(word, stopwords, is_stemming, is_math) if word: pos_tags.append(pair[1]) return pos_tags
def pos_tag(review): eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger') tmp = eng_tagger.tag(review) result = [] for element in tmp: result.append(element[1]) return result
def pos_tag(mots, jar=os.path.join(".", "models", "stanford-postagger", "stanford-postagger-3.8.0.jar"), mdl=os.path.join(".", "models", "stanford-postagger", "french-ud.tagger")): try: pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8') except LookupError: java_path = r"C:\Program Files (x86)\Java\jre1.8.0_261\bin\java.exe" os.environ['JAVAHOME'] = java_path pos_tagger = StanfordPOSTagger(mdl, jar, encoding='utf8') tagged = pos_tagger.tag(mots) tags = [g for m, g in tagged] forced_det = ["au", "aux"] absent_of_table = ["PART", "SCONJ"] if any(item in mots for item in forced_det) or any(item in tags for item in absent_of_table): for i, couple in enumerate(tagged): mot = couple[0] gram = couple[1] if mot in forced_det: tagged[i] = (mot, "DET") if gram == "PART": tagged[i] = (mot, "ADV") if gram == "SCONJ": tagged[i] = (mot, "CONJ") return tagged
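# Hedged usage sketch for the French pos_tag() above: the default jar/model paths under
# ./models/stanford-postagger are assumed to exist on this machine; the sentence is illustrative.
mots = "le chat dort au soleil".split()
print(pos_tag(mots))  # 'au' is forced to DET; PART and SCONJ tags are remapped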
def call_reia(): while (True): max_score = 0.1 map_val = "" with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt', 'r') as f: first_line = f.readline() while first_line == "": time.sleep(1) call_reia() print('-----------------------') user_input = first_line.split(' ', 1)[1] user_name = get_username(first_line.split(' ', 1)[0]) suggest_list = [] suggest_message = "" #prev_ts = ts print("\nINPUT = ") print(user_input) label = classify(user_input) if label == "": post_message( "Sorry, I could not understand. Please rephrase and try again." ) consume_message() continue print("Classified as : " + str(label)) tokens = nltk.word_tokenize(user_input) print(tokens) st = StanfordPOSTagger(config['tagger']['model'], path_to_jar=config['tagger']['path']) stanford_tag = st.tag(user_input.split()) print("Tags") print(stanford_tag) with open(MAPPING_PATH, 'r') as data_file: data = json.load(data_file) for i in data[label]: dist = jf.jaro_distance(str(user_input), str(i)) suggest_list.append(tuple((dist, i))) print(dist) if (dist > max_score): max_score = dist map_val = i if max_score < config['preferences']['similarity_threshold']: post_message( "Sorry, I could not understand. Please rephrase and try again." ) consume_message() if config['preferences']['suggestions'] == True: suggest = suggestions(suggest_list) post_message("Did you mean :") for i in suggest: suggest_message += (str(i[1]) + "\n") post_message(suggest_message) continue print("\nMapped to : " + map_val) #post_message(map_val) construct_command(user_input, label, tokens, map_val, stanford_tag, exec_command, user_name) #call('sed -i -e "1d " REIA/mqueue.txt') consume_message()
def get_postagger_for_criterion(criterion): ini_path = "/stanford/postagger" os.environ['STANFORD_PARSER'] = ini_path os.environ['STANFORD_MODELS'] = ini_path os.environ['CLASSPATH'] = ini_path st = StanfordPOSTagger('models/english-bidirectional-distsim.tagger') postagger_list = st.tag(criterion) return postagger_list
def standford_pos(text): eng_tagger = StanfordPOSTagger( model_filename= r'D:\Program Files\stanford-corenlp-full\stanford-postagger\models\english-bidirectional-distsim.tagger', path_to_jar= r'D:\Program Files\stanford-corenlp-full\stanford-postagger\stanford-postagger.jar' ) return eng_tagger.tag(text.split())
def getTagged(self, text): from nltk.tag import StanfordPOSTagger if self.lang == 1: jar = 'stanford-pos-tagger/stanford-postagger-3.8.0.jar' model = 'stanford-pos-tagger/french.tagger' pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') tokenizedText = nltk.word_tokenize(text.lower()) taggedText = pos_tagger.tag(tokenizedText) else: jar = 'stanford-pos-tagger/stanford-postagger-3.8.0.jar' model = 'stanford-pos-tagger/arabic.tagger' pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') tokenizedText = nltk.word_tokenize(text.lower()) taggedText = pos_tagger.tag(tokenizedText) print(taggedText) return taggedText
class FeatureProcessing(object): def __init__(self): self.feat_index = {} self.implication_words = ["demonstrate", "suggest", "indicate"] self.hyp_words = ["possible"] self.method_words = ["probe", "detect"] self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger') def get_features(self, phrase, filter_feature='0'): words = word_tokenize(phrase) pos_tags = self.pos_tagger.tag(words) features = [] for word, tag in pos_tags: wl = word.lower() # Feat 1: POS features if filter_feature != '1': if tag != ',' and tag != '.': features.append(tag) # Feat 2: Verb and adverb identity if filter_feature != '2': if tag == 'RB' or tag.startswith('VB'): features.append(wl) # Feat 3: Presence of figure references and citations if filter_feature != '3': if word.startswith("Fig"): features.append("figure") if re.search("[A-Z][^\s]+ et al.", phrase) is not None: features.append("reference") # Feat 4: Presence of specific words or phrases if filter_feature != '4': if re.search("[Dd]ata not shown", phrase) is not None: features.append("data_not_shown") for word in self.implication_words: if word in phrase: features.append("implication_word") for word in self.hyp_words: if word in phrase: features.append("hyp_word") for word in self.method_words: if word in phrase: features.append("method_word") return features def index_data(self, data, filter_feature='0'): all_features = [self.get_features(datum, filter_feature) for datum in data] for features in all_features: for feat in features: if feat not in self.feat_index: self.feat_index[feat] = len(self.feat_index) def featurize(self, phrase, filter_feature='0'): indexed_features = [0] * len(self.feat_index) features = self.get_features(phrase, filter_feature) for feat in features: if feat not in self.feat_index: continue indexed_features[self.feat_index[feat]] += 1 return indexed_features
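# Hedged usage sketch for FeatureProcessing above: assumes word_tokenize and re are imported at
# module level and that 'english-bidirectional-distsim.tagger' is resolvable via
# CLASSPATH / STANFORD_MODELS; the phrases are illustrative only.
fp = FeatureProcessing()
phrases = ["These data suggest a possible mechanism (Fig. 2).",
           "We detect the complex as described previously."]
fp.index_data(phrases)             # build the feature index over the small corpus
vector = fp.featurize(phrases[0])  # bag-of-feature counts aligned to that index
print(len(vector), vector)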
def gen_keyphrases(text): # Used when tokenizing words sentence_re = r'''(?x) # set flag to allow verbose regexps (?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A. | \w+(?:-\w+)* # words with optional internal hyphens | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82% | \.\.\. # ellipsis | [][.,;"'?():-_`] # these are separate tokens ''' lemmatizer = nltk.WordNetLemmatizer() #Taken from Su Nam Kim Paper... grammar = r""" NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns NP: {<NBAR>} {<NBAR><IN><NBAR>} # Above, connected with in/of/etc... """ chunker = nltk.RegexpParser(grammar) tokenizer = nltk.RegexpTokenizer(sentence_re) toks = tokenizer.tokenize(text) span_toks = tokenizer.span_tokenize(text) logger.debug(toks) logger.debug("tokens: %(1)d" % {"1": len(toks)}) # old way of tokenization #toks = nltk.regexp_tokenize(text, sentence_re) st = StanfordPOSTagger(config.stanford_bidirectional_tagger_path, config.stanford_postagger_jar_path, encoding="utf8", java_options="-mx8g") _postoks = st.tag(toks) # examine the postags, if "[", then change the tag to "X", create a new list postoks = [] for pt in _postoks: if pt[0] == "[": postoks.append(('[', 'X')) elif pt[0] == "]": postoks.append((']', 'X')) else: postoks.append(pt) logger.info(postoks) # NLTK POS Tagger #postoks = nltk.tag.pos_tag(toks) logger.debug("postoks: %(1)d" % {"1": len(postoks)}) tree = chunker.parse(postoks) # cast a Tree into a ParentedTree ptree = nltk.ParentedTree.convert(tree) # for each token, record its tree position pos_map = generate_pos_map(ptree, span_toks) stopwords = nltk.corpus.stopwords.words('english') return get_terms(ptree, lemmatizer, stopwords, pos_map)
def postagger(): os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27' os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/stanford-postagger.jar' os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH'] eng_tagger = StanfordPOSTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/models/english-bidirectional-distsim.tagger') for x in content: print(eng_tagger.tag(x.split()))
def number(sentence): pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') tagged_sentence = pos_tagger.tag(sentence.split()) int_list = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] numbers = [ word for word, tag in tagged_sentence if ((tag == 'DET' and det_or_nb(word) == 'nb') or (word[0] in int_list) ) ] return (' '.join(numbers))
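# Hedged usage sketch for number() above: model and jar are assumed to be module-level paths to
# the French tagger, and det_or_nb() is the project's own helper for written-out numbers.
print(number("il reste trois pommes et 2 poires"))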
def _tagging(data): df = pd.read_csv("/var/www/pyapi/scripts/Stanford_POS_Tags.csv") os.environ[ "STANFORD_MODELS"] = "/var/www/pyapi/scpDocs/stanford-postagger-full-2017-06-09/models" spanish_postagger = StanfordPOSTagger( 'spanish.tagger', '/var/www/pyapi/scpDocs/stanford-postagger-full-2017-06-09/stanford-postagger.jar' ) tagged = spanish_postagger.tag(data.split()) return _describe_stanford_pos_tag(tagged, df)
def stanford_pos_tag(text, java_path=None): _setup_java_home(java_path) model_name = "english-caseless-left3words-distsim.tagger" stanfort_dir = get_from_resource("stanford-postagger-full-2018-10-16") jar = str(stanfort_dir.joinpath("stanford-postagger-3.9.2.jar")) model = str(stanfort_dir.joinpath("models/{}".format(model_name))) st = StanfordPOSTagger(model, jar, encoding="utf8") text_tokenized = word_tokenize(text) return st.tag(text_tokenized)
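# Hedged usage sketch for stanford_pos_tag() above: get_from_resource() and the bundled
# stanford-postagger-full-2018-10-16 directory are assumed to be provided by the package.
print(stanford_pos_tag("the quick brown fox jumps over the lazy dog"))
# Optionally point it at a specific JVM (path is illustrative):
# print(stanford_pos_tag("hello world", java_path="/usr/lib/jvm/java-8-openjdk/bin/java"))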
def _tag_words(self, jar, model, cleaned_sentence): """ Arguments: cleaned_sentence Returns: tagged_words: a list containing tuples i.e (word, syntactic value). """ pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8") tagged_words = pos_tagger.tag(word_tokenize(cleaned_sentence)) return tagged_words
def getSolutionInTag(self): english_postagger = StanfordPOSTagger( "/home/geethu/Documents/project anlp/c/models/english-bidirectional-distsim.tagger", "/home/geethu/Documents/project anlp/c/stanford-postagger.jar") print(self.synt) #retag the sentence again new_tag_values = english_postagger.tag(self.words) for index in range(len(self.words)): print(index) word, tag = new_tag_values[index] self.synt[index] = tag
def Text_to_tag(Readfile, file): if path.exists(sys.path[0] + '/Preparation/save/data/' + file): remove(sys.path[0] + '/Preparation/save/data/' + file) tagger = StanfordPOSTagger(model_filename, path_to_jar) for line in open(Readfile): TagList = [] sentence = tagger.tag(line.split()) for WordTag in sentence: TagList.append(WordTag[1]) with open(file, 'a', encoding='utf-8') as Writer: Writer.write(" ".join(TagList) + '\n')
def impp(input_question): try: import numpy as np import os os.getcwd() import pandas as pd import spacy from . import formula nlp = spacy.load('en_core_web_sm') from difflib import SequenceMatcher import re import nltk import pprint pp = pprint.PrettyPrinter(indent=4) from nltk import word_tokenize from nltk.corpus import stopwords path_to_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0.jar' path_to_models_jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-parser-3.8.0-models.jar' jar = '/usr/local/lib/python2.7/dist-packages/nltk/tag/stanford-postagger-3.8.0.jar' model = '/usr/local/lib/python2.7/dist-packages/nltk/tag/models/english-left3words-distsim.tagger' from nltk.parse.corenlp import CoreNLPParser from nltk.tag import StanfordNERTagger from nltk.parse.stanford import StanfordParser from nltk.parse.stanford import StanfordDependencyParser from nltk.stem import PorterStemmer from nltk.tokenize import sent_tokenize from nltk.tag import StanfordPOSTagger pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar) #print ("1") #print (os.path.exists('/home/piut/django-apps/wps/wps/patterns.csv')) #print ("2") pattern=read('patterns.csv') #print ("1") #print pattern question=input_question tagged_question=pos_tagger.tag(nltk.word_tokenize(question)) doc = nlp(question) #print "###################################################################" #print doc #print ("2") result = dependency_parser.raw_parse(question) #pp.pprint(tagged_question) #print ("3") #return str(moreMoney(dependency,doc,pattern,unknown)) unknown=find(tagged_question,question,doc,input_question) if unknown==0: return 0 return unknown # fe except: return 0
def token_after(token, sentence): k = 0 pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') tagged_sentence = pos_tagger.tag(sentence.split()) for i in range(len(tagged_sentence)): if tagged_sentence[i][0] == token: k = i if tagged_sentence[k + 1][1] == 'NC': return (' '.join([tagged_sentence[k + 1][0]])) else: return None
class POSTagger: def __init__( self, path_to_model="/home/james/Downloads/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger", path_to_jar="/home/james/Downloads/stanford-postagger-full-2016-10-31/stanford-postagger.jar" ): self.tagger = StanfordPOSTagger(path_to_model, path_to_jar) def parse(self, line): line = nltk.word_tokenize(line) return self.tagger.tag(line)
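# Hedged usage sketch for the POSTagger wrapper above; the constructor defaults point at a local
# stanford-postagger-full-2016-10-31 install, so adjust both paths for your machine.
tagger = POSTagger()
print(tagger.parse("The quick brown fox jumps over the lazy dog."))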
def create_pos(self, tweet): self.pos_tweet = None tweet = word_tokenize(tweet) english_pos = StanfordPOSTagger( 'postagger/models/english-bidirectional-distsim.tagger', 'postagger/stanford-postagger.jar') self.pos_tweet = english_pos.tag(tweet) return self.pos_tweet
def pos_tagging(sentence): english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar') VP_list=[] POS_list=english_postagger.tag(sentence.split()) '''for i in range(0, len(POS_list)): if POS_list[i][1] in ['NNS','NNP','NNPS']: NP_list.append(POS_list[i][0])''' return POS_list
def get_pos_tag(sen):#pass sentence dataframe st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar= '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models') stanford_dir = st._stanford_jar.rpartition('/')[0] stanford_jars = find_jars_within_path(stanford_dir) st._stanford_jar = ':'.join(stanford_jars) for i in list(sen.index.get_values()): t=st.tag(sen.loc[i,'Arg'].split()) tags=[] for j in range(0,len(t)): tags.append(t[j][1]) #print i sen.set_value(i,'POStag',tags) return sen
def pos_person_tagging(sentence): #Setting the path and jar files for the POS Tagger english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar') NP_list=[] POS_list=english_postagger.tag(sentence.split()) for i in range(0, len(POS_list)): if POS_list[i][1] in ['NNS','NNP','NNPS']: NP_list.append(POS_list[i][0]) return NP_list
def main(): initialize() # create tagger model = '../stanford-postagger/models/chinese-distsim.tagger' jar = '../stanford-postagger/stanford-postagger.jar' zhPOS = StanfordPOSTagger(model, jar) # streaming model: process each line in turn with io.open(INFILE, 'r', encoding='utf8') as qts, io.open(OUTFILE, 'w', encoding='utf8') as pos: for line in qts: qtsPOS = zhPOS.tag(line) s = " ".join("%s" % tup[1] for tup in qtsPOS) + "\n" pos.write(s) return()
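# Hedged note on main() above: with the chinese-distsim model, NLTK yields ('', 'word#TAG') pairs,
# so tup[1] already carries both the token and its tag joined by '#'. A small illustrative split,
# reusing the same relative model/jar paths as in main():
zhPOS = StanfordPOSTagger('../stanford-postagger/models/chinese-distsim.tagger',
                          '../stanford-postagger/stanford-postagger.jar')
for _, word_and_tag in zhPOS.tag(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split()):
    print(word_and_tag.split('#'))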
class POSTagger:
    def __init__(self, tagger_path, model_path, output_filename):
        self.st = StanfordPOSTagger(tagger_path, model_path)
        self.output_filename = output_filename
        try:
            os.remove(self.output_filename)
        except OSError:
            pass

    def output_knowledge(self, sentence):
        sentence += " ."
        # Append one "word<TAB>tag" line per token, with a blank line after each sentence.
        with open(self.output_filename, "a", encoding="utf-8") as file:
            for word, pos_tag in self.st.tag(sentence.split()):
                file.write("%s\t%s\n" % (word, pos_tag))
            file.write("\n")
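# Hedged usage sketch for the file-writing POSTagger above: despite the parameter names, the first
# argument is the .tagger model and the second is the Stanford jar (matching StanfordPOSTagger's
# signature); the paths and output file name are illustrative.
pt = POSTagger('english-bidirectional-distsim.tagger', 'stanford-postagger.jar', 'tagged_output.txt')
pt.output_knowledge("The cat sat on the mat")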
def get_pos_tag(sen): os.environ['CLASSPATH']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar' #set classpath to pos tagger os.environ['STANFORD_MODELS']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models' st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar= '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models') stanford_dir = st._stanford_jar.rpartition('/')[0] stanford_jars = find_jars_within_path(stanford_dir) st._stanford_jar = ':'.join(stanford_jars) for i in list(sen.index.get_values()): t=st.tag(sen.loc[i,'Arg'].split()) tags=[] for j in range(0,len(t)): tags.append(t[j][1]) #print i sen.set_value(i,'POStag',tags) return sen
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = 'where-you-unzipped-the-tagger/models/english-bidirectional-distsim.tagger'
STANFORD_POS_JAR_PATH = 'where-you-unzipped-the-tagger/stanford-postagger-3.6.0.jar'

pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

# A made-up example sentence; replace it with any sentence you want to experiment with.
text = 'One day in November 2016, the two authors of this book, Seungyeon and Youngjoo, had a coffee at Red Rock cafe, which is a very popular place in Mountain View.'
tokens = word_tokenize(text)
print(tokens)  # Print the individual tokens.
print()
tagged = pos_tagger.tag(tokens)  # Tag once and reuse the result below.
print(tagged)  # Print the POS-tagged tokens.

# Now pull out only the verbs and nouns.
noun_and_verbs = []
for token in tagged:
    if token[1].startswith('V') or token[1].startswith('N'):
        noun_and_verbs.append(token[0])
print(', '.join(noun_and_verbs))
buf = 0 for k in range(len(synonymSet_h)): for n in range(len(synonymSet_t)): ##############################modifying function###################### #if synonymSet_h[k].wup_similarity(synonymSet_t[n]!=None): # x.append(synonymSet_h[k].wup_similarity(SynonymSet_t[n])) if synonymSet_h[k].wup_similarity(synonymSet_t[n])!=None: if buf<synonymSet_h[k].wup_similarity(synonymSet_t[n]): buf=synonymSet_h[k].wup_similarity(synonymSet_t[n]) return buf for m in root.findall("pair"): hypothesis=m.findtext("h").casefold() tokenized_hypothesis=nltk.word_tokenize(hypothesis) # tagged_tokenized_hypothesis=nltk.pos_tag(tokenized_hypothesis) #nltk tagger tagged_tokenized_hypothesis=st.tag(tokenized_hypothesis) #stanfordnlp tagger text=m.findtext("t").casefold() tokenized_text=nltk.word_tokenize(text) # tagged_tokenized_text=nltk.pos_tag(tokenized_text) #nltk tagger tagged_tokenized_text=st.tag(tokenized_text) #stanfordnlp tagger output.write("newhypo:\n") for i in range(len(tokenized_hypothesis)): output.write(tagged_tokenized_hypothesis[i][0]) output.write(tagged_tokenized_hypothesis[i][1]) output.write("newtext:\n") for j in range(len(tokenized_text)): output.write(tagged_tokenized_text[j][0]) output.write(tagged_tokenized_text[j][1]) output.write("value:\n") output.write(m.get("entailment")) output.write("\n")
__author__ = 'Anirudh'
import codecs
import nltk
from nltk.tag import StanfordPOSTagger

nltk.internals.config_java(r"C:\Program Files\Java\jdk1.8.0_60\bin\java.exe")
import os
java_path = r"C:\Program Files\Java\jdk1.8.0_60\bin\java.exe"
os.environ['JAVAHOME'] = java_path

# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st = StanfordPOSTagger(r'D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\arabic.tagger', r'D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')
#st = StanfordPOSTagger(r'D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\english-bidirectional-distsim.tagger', r'D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')

# Re-encode the UTF-16 (big-endian) Arabic input as UTF-8 before tagging.
file = "arabic_in.txt"
source = codecs.open(file, "r", "utf-16-be")
destination = codecs.open("utf8encoder_out.txt", "wb", "utf-8")
contents = source.read()
destination.write(contents)
destination = codecs.open("utf8encoder_out.txt", "r", "utf-8")
contents2 = destination.read()

print(contents2.split())
print(st.tag(contents2.split()))
readFile = (open(filename)).read() paras = readFile.split('\n') parasCopy = [] paraIndex = 0 for paragraph in paras: paraIndex += 1 logging.info('Processing paragraph %d' %paraIndex) if not paragraph == '': name = '' paragraphCopy = "" sentenceList = getSentences(paragraph) sentenceIndex = 0 for sentence in sentenceList: sentenceIndex += 1 logging.info('Processing sentence %d' %sentenceIndex) tokens = POSTagger.tag(sentence.split()) logging.info('POS Tagging of a sentence') nameAnalysis = getName (sentence, tokens) sentenceCopy = sentence if nameAnalysis[0] == '' and nameAnalysis[1] > 0 and not name == '': sentenceCopy = replacePRP(nameAnalysis[2], name, sentence) elif not nameAnalysis[0] == '' and nameAnalysis[3] == 1: name = nameAnalysis[0] if sentenceCopy.count('(') > 0 and not name == '': dateBucket = bracketProcess(sentenceCopy, tokens) sentenceCopy = bracketRemove(sentenceCopy) if not dateBucket == []: date_1 = dateBucket[0] sentence_1 = name + " was born in" for i in date_1: sentence_1 += " " + i
home_path + '/stanford-postagger.jar') url_noun = [] url_not_noun = [] pos = ['NN', 'NNS', 'IN', 'JJ', 'JJS', 'RB', 'TO', 'PRP', 'PRP$', 'NNP', 'NNPS', 'DT', 'VBG', 'VBN', 'VBD'] count = 1 for path in paths: print str(count) + '/' + str(len(paths)) count += 1 isNoun = True print path # remove parameters in path, such as {id}, [id], :id, and split url by level, namely by '/' urls = re.sub('/?[\[{].*?[\]}]|/:\w+', '', path).replace('.json', '').lstrip('/').split('/') for url in urls: for word_pos in st.tag(get_divided_url(url)): # print st.tag(get_divided_url(url)) if word_pos[1] not in pos: url_not_noun.append(path) isNoun = False break if not isNoun: break if not isNoun: continue url_noun.append(path) # save result to swagger_statistic.json swagger_statistic = OrderedDict() swagger_statistic['host'] = host swagger_statistic['basePath'] = basePath
""" ###################################################################################### from nltk.tag import StanfordPOSTagger jar = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar' model = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/models/french.tagger' import os java_path = "C:/Program Files/Java/jdk1.8.0_151/bin/java.exe" os.environ['JAVAHOME'] = java_path pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') words={} tab2 = {} for i in range(5): select=[] n=pos_tagger.tag(tab[i]) stops_verb=['NC','N','NPP'] for x in n: if x[1] in stops_verb: select.append(x[0]) #sel = max(set(select), key=select.count) #tab2[i]=sel words={} for word in set(select): count = 0 for j in range(len(select)): if word == select[j]: count += 1 words[word]= count tab2[i] = (sorted( words.items(), key = lambda x : -x[1]))[:2]
def Process_ZH(File): # Read file with open(File, 'r') as File: # print 'Opened' Input = File.readlines() try: for line in Input: # Checks if identifier is in the line if 'segment' in line: #Sets counter to be on and starts count at 0 Annotations = [] Annotation_Next = False Line = [] Word_Count = 0 # Switch to UTF-8 to ensure accurate counting Line_UTF8_Decode = line.decode('utf-8') Line_Split = Line_UTF8_Decode.split() for Split in Line_Split: if 'feature' in Split: Annotations.append([Split[17:-1], Word_Count, 0]) # print Annotations elif 'state=' in Split: Line_Temp = re.findall('>([^>]*)</', Split) # print 'State_1' if Line_Temp != []: # Ensures that Line_Temp is a string Line_Temp = Line_Temp[0] Word_Count += 1 # print 'State_2' # To ensure nested entities are parsed correctly if Annotations[-1][2] != 0: Annotation_Next = True Length = range(len(Annotations)) for x in Length[::-1]: if Annotations[x][2] == 0 and Annotation_Next == True: Annotations[x][2] = Word_Count Annotation_Next = False else: Annotations[-1][2] = Word_Count # print Annotations elif Line_Temp == [] and '<segment' in Split[15:]: pass else: Word_Count += 1 Line_Temp = Split[15:] # print 'State 3' if Line_Temp != []: Line.append(Line_Temp) elif '</segment>' in Split: Seg_Split = Split.split('</segment>') for x in Seg_Split: if x != '': Word_Count += 1 Line.append(x) elif x == '': if Annotations[-1][2] != 0: # print 'Seg 2' Annotation_Next = True Length = range(len(Annotations)) for x in Length[::-1]: # print Annotations[x][2] if Annotations[x][2] == 0 and Annotation_Next == True: # print 'Seg 3' Annotations[x][2] = Word_Count Annotation_Next = False else: Annotations[-1][2] = Word_Count # print Annotations # if '<' not in Split[0]: # Word_Count += 1 # print Split # Line_Temp = Split[:-10] # print Line_Temp # Line.append(Line_Temp) # # print 'Seg_1' # if Annotations[-1][2] != 0: # # print 'Seg 2' # Annotation_Next = True # Length = range(len(Annotations)) # for x in Length[::-1]: # # print Annotations[x][2] # if Annotations[x][2] == 0 and Annotation_Next == True: # # print 'Seg 3' # Annotations[x][2] = Word_Count # Annotation_Next = False # else: # Annotations[-1][2] = Word_Count # # print Annotations elif '<segment' not in Split: # print Split Line.append(Split) # Checks if Split is a punctuation character if re.findall('[%s]' % zhon.hanzi.punctuation, Split) == [] and Split != ':': Word_Count += 1 Line_Done = ' '.join(Line) # Tags using StanfordPOSTagger ST = StanfordPOSTagger('~/Annotations/models/chinese-distsim.tagger', '~/Annotations/stanford-postagger.jar', encoding='utf-8') Tags = ST.tag(Line) Tags_Done = '' for x in Tags: # print x Tags_Done += x[1][-2:] + ' ' # print Line_Done # print Tags_Done Annotations_Done = '' for x in Annotations: Annotations_Done += str(x[1]) + ',' + str(x[2]) + ',' + str(x[1]) + ',' + str(x[2]) + ' ' + x[0].upper() + '|' # print Annotations_Done with open('Processed_Annotations.txt', 'a') as P_A: P_A.write(Line_Done.encode('utf-8') + '\n') P_A.write(Tags_Done + '\n') P_A.write(Annotations_Done[:-1] + '\n' + '\n') except IndexError: pass
# Chinese named-entity recognition
# chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
# for word, tag in chi_tagger.tag(sent.split()):
#     print(word, tag)

# English POS tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print(eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split()))

# Chinese POS tagging: the chinese-distsim model returns each token as 'word#TAG'
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print(word, tag)

# Constituency parsing for Chinese and English; only the model path differs
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()

# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
resultList.append(3) elif util.is_ending_withComma(token): resultList.append(4) else: resultList.append(5) return resultList if __name__=="__main__": string = open(properties.test_raw).read() str_ = re.sub('[^a-zA-Z0-9\n\.\,\x7f-\xff]', ' ', string) resultList = classLabel(str_.split()) cleaned_test_str = re.sub('[^a-zA-Z0-9\n\x7f-\xff]', ' ', string).lower() postag_t = st.tag(cleaned_test_str.split()) text_file = open(properties.test_tagged_output_file, "w") invokeChunker(cleaned_test_str) chunkTags = extractChunkTags() postag = [] l = [',','...','.','\'','!'] for i in range(len(postag_t)): if postag_t[i][0] not in l: postag.append(postag_t[i]) for i in range(len(postag)): tup = postag[i] token = tup[0] tag = tup[1] chunkTag = chunkTags[i][3]