def tagged_def():
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    # NOTE: the second argument must point at stanford-postagger.jar, not at
    # the model file as in the original; the jar filename below is an assumed fix.
    tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger.jar')
    path_data = "data" + os.sep + "items_tagged_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            # print chardet.detect(definition)
            print definition.encode('gbk')
            # strip parenthesised asides before tagging
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            tokens = nltk.word_tokenize(definition_pure)
            # print tokens
            for token in tokens:
                print chardet.detect(token)
            tagged_tokens = tagger.tag(definition_pure.encode('utf-8').split())
            pos2def['tagged_def'] = tagged_tokens
    path_tagged_output = "items_tagged_auto.json"
    json.dump(data, codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
def getUsername(message, *args):
    pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
    words = nltk.word_tokenize(message.lower())
    tagged_words = pos_tagger.tag(words)
    sug_usernames = []
    # Check whether a previous username input was passed
    if len(args) > 0:
        previous_username = args[0]
        sug_usernames = [word for word, tag in tagged_words
                         if tag in ['NN', 'NNP', 'FW', 'NNPS']
                         and word != previous_username]
    else:
        sug_usernames = [word for word, tag in tagged_words
                         if tag in ['NN', 'NNP', 'FW', 'NNPS']]
    if len(sug_usernames) > 0:
        if getSentenceSentiment(message) == 'pos':
            return sug_usernames[-1]  # return the last suggested username
        else:
            return sug_usernames[-1] + 'salt123'
    return 'randomuser567user'
def extractor():
    st = StanfordPOSTagger(
        '../stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',
        '../stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar')
    nouns = []
    pnouns = []
    i = 0
    with open('../data/scraped_text_NYT.txt', 'r', encoding='utf-8') as inputFile:
        comment = inputFile.readline()
        while comment != "":
            sentences = sent_tokenize(comment, 'english')
            for sent in sentences:
                if sent.strip() == "":
                    continue
                pos_tags = st.tag(sent.split())
                for pos_tag in pos_tags:
                    if pos_tag[1] == 'NN' or pos_tag[1] == 'NNS':
                        nouns.append(pos_tag[0])
                    elif pos_tag[1] == 'NNP' or pos_tag[1] == 'NNPS':
                        pnouns.append(pos_tag[0])
            i += 1
            print(i)
            print(comment)
            comment = inputFile.readline()
    outFile = open('../data/nouns_scraped_text_NYT.txt', 'a')
    outFile.write('NOUNS:\n')
    for noun in nouns:
        outFile.write(noun + "\n")
    outFile.write('\n\nPNOUNS:\n')
    for pnoun in pnouns:
        outFile.write(pnoun + '\n')
    outFile.close()  # the original never closed the output file
def __init__(self):
    # stanford ner tagger
    from nltk.tag.stanford import StanfordNERTagger
    self.ner_stanford = StanfordNERTagger(
        '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz',
        '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar')
    # stanford pos tagger
    from nltk.tag.stanford import StanfordPOSTagger
    self.pos_stanford = StanfordPOSTagger(
        '/home/harish/Documents/softwares/running/corenlp/stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger',
        '/home/harish/Documents/softwares/running/corenlp/stanford-postagger-full-2015-04-20/stanford-postagger.jar')
    # spacy ner tagger
    import spacy
    self.ner_spacy = spacy.load('en')
    # wordnet lemmatizer
    from nltk.stem.wordnet import WordNetLemmatizer
    self.lemmatizer = WordNetLemmatizer()
    self.tagged_output = {}
def test_StanfordAndNLTKPOS():
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    # successive assignments deliberately overwrite `sent`; only the last one is tagged
    sent = 'a low-calorie sweetener that reduces caries activity and the growth and transmission of S. mutans.'
    sent = 'a wire formed by drawing a cast structure through a die; used in dentistry for partial denture clasps and orthodontic appliances.'
    sent = 'readily stained with acid dyes.'
    print chardet.detect(sent)
    # sent='technique metered spray refers to a topical anesthetic dispersal technique that controls the amount and rate at which a drug is administered.'
    # sent='older term for a traumatic ulcer of the oral mucosa.'
    # sent='one or more vertically parallel surfaces of abutment teeth shaped to direct the path of placement and removal of a remarkable partial denture. Also called guiding plane.'
    # sent='agents that bond, seal, or cement particles or objects together.'
    # sent='teeth that are at such an angle as to cause them to be out of centric contact with opposing teeth during occlusion.'
    start = datetime.now()
    text = nltk.word_tokenize(sent)
    nltk_pos = nltk.pos_tag(text)
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    # NOTE: the second argument must point at stanford-postagger.jar, not at
    # the model file as in the original; the jar filename below is an assumed fix.
    stanford_tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger.jar')
    stanford_pos = stanford_tagger.tag(text)
    print 'nltk_pos: ' + str(nltk_pos)
    print 'stanford_pos: ' + str(stanford_pos)
def tagWordsInSentences(self, studying, entry):
    '''Tags the part of speech for each word.'''
    jar_path = 'stanford-postagger-full/stanford-postagger.jar'
    if studying in self.english:
        words = parseWordsFromEntry(entry)
        tagged_words = tagWords(words)
        return tagged_words
    # NOTE: the original condition `studying in self.japanese or self.korean
    # or self.mandarin` only tested membership in self.japanese; fixed below.
    elif (studying in self.japanese or studying in self.korean
          or studying in self.mandarin):
        #segmenter = TinySegmenter()
        #words = segmenter.tokenize(entry)
        rm = RakutenMA()
        tagged_words = rm.tokenize(entry)
        #mecab = Mecab()
        #tagged_words = mecab.pos(entry)
        return tagged_words
    else:
        if studying in self.spanish:
            model_path = 'stanford-postagger-full/models/spanish.tagger'
            words = parseWordsFromEntry(entry)
        elif studying in self.french:
            model_path = 'stanford-postagger-full/models/french.tagger'
            words = parseWordsFromEntry(entry)
        postagger = StanfordPOSTagger(model_path, jar_path, encoding='utf8')
        tagged_words = postagger.tag(words)
        return tagged_words
def clean_words(tokens, filterStopwords=False, filterPos=None):
    cleanTokens = []
    stopwordList = stopwords.words('spanish')
    if filterPos:
        tagger = StanfordPOSTagger('stanford/models/spanish.tagger',
                                   'stanford/stanford-postagger.jar',
                                   encoding='utf8')
    for token in tokens:
        # strip punctuation from the token first
        cleanToken = token
        for char in string.punctuation:
            cleanToken = cleanToken.replace(char, "")
        if filterPos and not filterStopwords:
            res = tagger.tag([cleanToken])
            if len(res) > 0:
                word, pos = res[0]
                if pos[0] in filterPos:
                    cleanTokens.append(cleanToken)
        elif filterStopwords and not filterPos:
            if cleanToken not in stopwordList:
                cleanTokens.append(cleanToken)
        elif filterStopwords and filterPos:
            res = tagger.tag([cleanToken])
            if len(res) > 0:
                word, pos = res[0]
                if cleanToken not in stopwordList and pos[0] in filterPos:
                    cleanTokens.append(cleanToken)
        elif not filterStopwords and not filterPos:
            cleanTokens.append(cleanToken)
    return cleanTokens
def posTagging():
    # myNounPhrases must be initialised here; the original commented this
    # line out even though the list is appended to below
    myNounPhrases = []
    myCompletePOSStructure = []
    a = ['NNP', 'NNPS']  # Avoid NN, NNS. Only NNP, NNPS for the purpose of NER.
    print '######## POS'
    english_postagger = StanfordPOSTagger(
        './Masters-Passau/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger',
        './Masters-Passau/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
    #abc = english_postagger.tag('Steve Jobs was Founder of Apple. He was born in United States of America'.split())
    abc = english_postagger.tag('Who was the CEO of IBM'.split())
    print abc
    for number in abc:
        someTup = (number[0].encode('utf8'), number[1].encode('utf8'))
        myCompletePOSStructure.append(someTup)
    print myCompletePOSStructure
    for number in abc:
        if any(x in number for x in a):
            split1 = str(number).split(',')
            split2 = str(split1[0]).split('u')
            # print split2[1].replace("'", "")
            myNounPhrases.append(number)
def __init__(self, translation_id):
    # Specify paths to the Stanford taggers
    STANFORD_POS_TAGGER_LOCATION = os.environ['STANFORD_POS']
    english_modelfile = '{}/models/english-bidirectional-distsim.tagger'.format(STANFORD_POS_TAGGER_LOCATION)
    spanish_modelfile = '{}/models/spanish-distsim.tagger'.format(STANFORD_POS_TAGGER_LOCATION)
    jarfile = '{}/stanford-postagger-3.7.0.jar'.format(STANFORD_POS_TAGGER_LOCATION)
    # Set the translation ID
    self.translation_id = translation_id
    # Initialize the taggers
    self.en_tagger = StanfordPOSTagger(model_filename=english_modelfile, path_to_jar=jarfile)
    self.es_tagger = StanfordPOSTagger(model_filename=spanish_modelfile, path_to_jar=jarfile)
    # Store the string literals from the VA3 files
    self.va3l1 = []
    self.va3l2 = []
    # Store tokenized plaintext sentences
    self.l1_tok_sent = []
    self.l2_tok_sent = []
    # Store the alignments as lists of lists of ints
    self.l1_alignments = []
    self.l2_alignments = []
    # Store the POS tags as lists of strings
    self.l1_pos_tags = []
    self.l2_pos_tags = []
def pos_tagging(docs, stanford_path, pos_tagger):
    print("\nGenerating Part-of-Speech tags...")
    # Configure the Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)
    tagger = StanfordPOSTagger(model_filename=path_to_model, path_to_jar=path_to_jar)
    # Set a higher memory limit for long sentences
    tagger.java_options = '-mx8192m'
    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]
        try:
            # Perform POS tagging
            tagged = tagger.tag(tokens)
        except Exception:  # narrowed from a bare except; skip docs the tagger cannot handle
            continue
        # Keep the word, its POS tag, and its label
        data.append([(w, pos, label)
                     for (w, label), (word, pos) in zip(doc, tagged)])
    return data
def get_tagger():
    '''Set up and return the Stanford tagger object.'''
    path_to_model = "/home/avery/Applications/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger"
    path_to_jar = "/home/avery/Applications/stanford-postagger-2018-02-27/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = "-mx8192m"
    # Use: tagger.tag(word_tokenize(string))
    return tagger
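# A minimal usage sketch for get_tagger() above, assuming NLTK's word_tokenize
# is importable and the hard-coded model/jar paths exist on this machine.
# The sample sentence is illustrative only.
from nltk import word_tokenize

tagger = get_tagger()
print(tagger.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))
# -> a list of (token, Penn Treebank tag) tuples, e.g. ('fox', 'NN')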
def __init__(self): """ Initializes the tagger object """ self.model = TAGGER_MODEL self.jar_file = POS_TAGGER_JAR_FILE self.tagger = StanfordPOSTagger(self.model, self.jar_file) self.tagger_type = STANFORD_TAGGER_NAME
def part_of_speech_tagging(
        self, words: List[str],
        multi_word_name_entities: Set[str]) -> List[Tuple[str, str]]:
    """
    Perform part-of-speech tagging using StanfordPOSTagger.
    :param words: a list of words in a sentence
    :param multi_word_name_entities: a set of multi-word named entities
    :return: part-of-speech tags of the sentence
    """
    # define the pos tagger
    path_to_model = 'stanford/pos/english-bidirectional-distsim.tagger'
    path_to_jar = 'stanford/pos/stanford-postagger.jar'
    pos_tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    stan_pos_tag = pos_tagger.tag(words[:-1])  # omit the last period
    normal_pos_tag = nltk.pos_tag(words[:-1])  # omit the last period
    # print('Stanford POS tagging:', stan_pos_tag)  # for comparison
    # print('nltk.pos_tag tagging:', normal_pos_tag)  # for comparison

    def post_treatment(stan_pos_tag: List[Tuple[str, str]],
                       norm_pos_tag: List[Tuple[str, str]],
                       multi_word_name_entities: Set[str]) -> None:
        """
        Combine the multi-word named entities.
        nltk.pos_tag labels multi-word named entities together, so
        stan_pos_tag is corrected here using norm_pos_tag. The problem with
        norm_pos_tag is that it often mislabels words, which is why
        StanfordPOSTagger is preferred overall.
        :param stan_pos_tag: pos-tags of the sentence from the Stanford tagger
        :param norm_pos_tag: pos-tags of the sentence from nltk.pos_tag
        """
        stan_len = len(stan_pos_tag)
        norm_len = len(norm_pos_tag)
        stan_i = 0
        norm_i = 0
        while stan_i < stan_len and norm_i < norm_len:
            stan_word, stan_pos = stan_pos_tag[stan_i]
            norm_word, norm_pos = norm_pos_tag[norm_i]
            # check if the word starts a multi-word named entity
            if (stan_word == norm_word.split(' ')[0]
                    and norm_word in multi_word_name_entities):
                # scan the following words in stan_pos_tag and combine them
                # if they form the multi-word entity
                temp_i = stan_i + 1
                match_idx = 1
                entities = norm_word.split(' ')
                while temp_i < stan_len and match_idx < len(entities):
                    temp_word, temp_pos = stan_pos_tag[temp_i]
                    if temp_word == entities[match_idx]:
                        _ = stan_pos_tag.pop(temp_i)
                        match_idx += 1
                    else:
                        break
                stan_pos_tag[stan_i] = (norm_word, stan_pos)
            stan_i += 1
            norm_i += 1

    post_treatment(stan_pos_tag, normal_pos_tag, multi_word_name_entities)
    return stan_pos_tag
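# Illustrative (hypothetical) behaviour of post_treatment above, assuming
# 'Stony Brook' appears in multi_word_name_entities: a Stanford output of
# [('Stony', 'NNP'), ('Brook', 'NNP'), ('is', 'VBZ')] would be merged into
# [('Stony Brook', 'NNP'), ('is', 'VBZ')], keeping the Stanford tag of the
# entity's first word.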
class POSTagger: """POSTagger creates a POS tagger for german language. Different tagger are available to use.""" STAN = "stanford-hgc-tagger" SFT = "stanford-fast-tagger" TT = "tree-tagger" SPACY = "spacy-tagger" # paths to Stanford tagger modules __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar" __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/" def __init__(self, tagger): """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger.""" self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar) if tagger == POSTagger.STAN: self.tagger_name = POSTagger.STAN self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar, model_filename=POSTagger.__model_file_name + "german-hgc.tagger") elif tagger == POSTagger.SFT: self.tagger_name = POSTagger.SFT self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar, model_filename=POSTagger.__model_file_name + "german-fast.tagger") elif tagger == POSTagger.TT: self.tagger_name = POSTagger.TT self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de') # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards elif tagger == POSTagger.SPACY: self.tagger_name = POSTagger.SPACY self.__tagger = spacy.load('de') else: raise Exception("Wrong tagger parameter.") def tag(self, text): """POS tag tokenized text.""" if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN: tokens = self.__tokenizer.tokenize(text) return self.__tagger.tag(tokens) elif self.tagger_name == POSTagger.TT: tags = self.__tagger.tag_text(text) tuple_list = [] tag_list = treetaggerwrapper.make_tags(tags) for item in tag_list: tuple_list.append((item[0], item[1])) return tuple_list elif self.tagger_name == POSTagger.SPACY: tags = self.__tagger(text) tuple_list = [] for word in tags: tuple_list.append((word.orth_, word.tag_)) return tuple_list else: pass #tagger = POSTagger("spacy-tagger") #doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.") #print(tagger.tag("Ich werde morgen in die Schule gehen.")) #print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
def pos_tag(to_tag, stanford_postagger_path):
    '''Tag the tokens with parts of speech; to_tag is the list of tokens to
    tag, and stanford_postagger_path is the directory containing the Stanford
    POS tagger model files and jar file.'''
    # create a POS tagger object that reads UTF-8 input
    pos_tagger = StanfordPOSTagger(
        stanford_postagger_path + "\\models\\french.tagger",
        stanford_postagger_path + "\\stanford-postagger.jar",
        encoding='utf8')
    # run the tagging algorithm on the tokenized raw text
    tags = pos_tagger.tag(to_tag)
    return tags
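# Hedged example call for pos_tag() above; the Windows-style path is a
# placeholder assumption and must point at your own Stanford POS tagger
# installation containing models\french.tagger and stanford-postagger.jar.
tokens = ['Le', 'chat', 'dort', '.']
tags = pos_tag(tokens, 'C:\\tools\\stanford-postagger-full')
# tags is a list of (token, tag) pairs from the French model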
def getTags(sen_arr):
    tag_arr = []
    st = StanfordPOSTagger('english-left3words-distsim.tagger')
    res = st.tag(sen_arr)
    for i in res:
        tag = i[1].encode("utf-8")
        tag_arr.append(tag)
    return tag_arr
def determine_sentpos_by_nltk(self, sentence):
    '''Get the POS tags for a sentence via NLTK's Stanford tagger wrapper.'''
    pos_model_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/models/english-bidirectional-distsim.tagger"
    pos_jar_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/stanford-postagger.jar"
    pos = StanfordPOSTagger(model_filename=pos_model_file, path_to_jar=pos_jar_file)
    return pos.tag(sentence.split(" "))
def pos_tagger(text):
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar')
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
def posInput(text):
    print("POS")
    path_to_model = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
    path_to_jar = "./stanford-postagger/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  # set a higher memory limit for long sentences
    # sentence = 'THIS IS TESTING'
    result = tagger.tag(word_tokenize(text))
    # print result
    return result
def tagger(self):
    self.tokenize(self.taggerUse)
    if self.taggerUse == 'standford':
        tagger = StanfordPOSTagger(
            '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/models/spanish-distsim.tagger',
            '/home/jmutal/dataMining/stanford-postagger-full-2017-06-09/stanford-postagger-3.8.0.jar')
        tagged_sents = tagger.tag_sents(self.clean_corpus)
    else:
        tagged_sents = self.nlp.pipe(self.clean_corpus, n_threads=8)
        tagged_sents = self.proccess_spacy(tagged_sents)
    return self.dig2num(tagged_sents)
def transform_wnli(premise, hypothesis):
    cased_premise = premise
    # transform WNLI examples back into WSC format
    premise = [w.lower() for w in nltk.word_tokenize(premise)]
    hypothesis = [w.lower() for w in nltk.word_tokenize(hypothesis)]
    best_target = ["", "", "", "", "", ""]  # should get overwritten
    best_masked_s = []
    for l in range(len(hypothesis)):
        for r in range(l + 1, l + 6):
            left_part = hypothesis[:l]
            right_part = hypothesis[r:]
            pattern = left_part + ["_"] + right_part
            for s in range(len(premise)):
                ok = True
                if s + len(pattern) > len(premise):
                    break
                for a, b in zip(pattern, premise[s:s + len(pattern)]):
                    if a == "_":
                        continue
                    if a == b:
                        continue
                    if a in [',', '.', '?', '!'] and b in [',', '.', '?', '!']:
                        # punctuation is ignored
                        continue
                    ok = False
                    break
                if ok and len(hypothesis[l:r]) <= len(best_target):
                    best_target = hypothesis[l:r]
                    best_masked_s = premise[:s] + pattern + premise[s + len(pattern):]
    if len(best_masked_s) == 0:  # we failed
        return None, None
    # We extracted the masked sentence from the premise.
    global POS_tagger
    if POS_tagger is None:
        os.environ['STANFORD_MODELS'] = "stanford-postagger-2018-10-16/models"
        os.environ['CLASSPATH'] = "stanford-postagger-2018-10-16"
        POS_tagger = StanfordPOSTagger("stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger")
    tagged_premise = POS_tagger.tag(nltk.word_tokenize(cased_premise))
    # collect maximal runs of nouns as candidate referents
    candidates = []
    current = []
    for word, tag in tagged_premise:
        if tag in ["NN", "NNS", "NNP", "NNPS"]:
            current.append(word)
        else:
            if current != []:
                candidates.append(" ".join(current).lower())
                current = []
    if current != []:
        candidates.append(" ".join(current).lower())
    best_target = " ".join(best_target)
    candidates = [c for c in candidates
                  if c.find(best_target) == -1 and best_target.find(c) == -1]
    candidates = [best_target] + candidates
    found_sentence = " ".join(best_masked_s).replace(" n't", "n't").replace(" 's", "'s")  # sorry, nltk
    return found_sentence, candidates
def tag(tokens):
    #java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
    #os.environ['JAVAHOME'] = java_path
    special_symbols_array = ["the", "a", "an"]
    english_postagger = StanfordPOSTagger(
        'tagger/english-bidirectional-distsim.tagger',
        'tagger/stanford-postagger.jar')
    token_tag_array = english_postagger.tag(tokens)
    # NOTE: the original removed elements from token_tag_array while iterating
    # over it, which skips items; filtering into a new list avoids that bug.
    token_tag_array = [element for element in token_tag_array
                       if element[0].lower() not in special_symbols_array]
    return token_tag_array
def __init__(self):
    # The user needs to download the Stanford Parser, NER and POS tagger
    # from the Stanford website.
    self.constituent_parse_tree = StanfordParser()  # needs to be set as an environment variable
    self.stanford_dependency = StanfordDependencyParser()  # needs to be set as an environment variable
    self.lemma = WordNetLemmatizer()
    self.home = '/home/ramesh'  # the user needs to download the Stanford packages and change this directory
    self.ner = StanfordNERTagger(
        self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        self.home + '/stanford-ner-2017-06-09/stanford-ner.jar')
    self.pos_tag = StanfordPOSTagger(
        self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',
        self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar')
    self.CharacterOffsetEnd = 0
    self.CharacterOffsetBegin = 0
def __init__(self): # print "Inside ntlk util" self.constituent_parse_tree = StanfordParser() self.stanford_dependency = StanfordDependencyParser() self.lemma = WordNetLemmatizer() self.home = '/home/ramesh/Documents/mas_course/second_semester/rnd/rnd_submission_cd' self.ner = StanfordNERTagger(self.home + '/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',self.home + '/stanford-ner-2017-06-09/stanford-ner.jar') self.pos_tag = StanfordPOSTagger(self.home + '/stanford-postagger-2017-06-09/models/english-bidirectional-distsim.tagger',self.home + '/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar') self.CharacterOffsetEnd = 0 self.CharacterOffsetBegin = 0 self.contractions = {"'nt":"not", "'ll": " will", "'re":"are", "'ve":"have", "'m":"am"}
def create_pos(self, tweet):
    self.pos_tweet = None
    tweet = word_tokenize(tweet.lower())
    english_pos = StanfordPOSTagger(
        'postagger/models/english-bidirectional-distsim.tagger',
        'postagger/stanford-postagger.jar')
    self.pos_tweet = english_pos.tag(tweet)
    return self.pos_tweet
def __init__(self, posTagger, filePath, saveExtension, loadExtension,
             plainExtension, contextLength, replacementFrame,
             replacementFrameExtensions, replacementCount):
    self.posTagger_Model = posTagger + "/models/english-bidirectional-distsim.tagger"
    self.posTagger_Jar = posTagger + "/stanford-postagger.jar"
    self.filePath = filePath
    self.savePath = filePath + saveExtension
    self.loadPath = filePath + loadExtension
    self.plainPath = filePath + plainExtension
    self.contextLength = contextLength
    self.replacementFrame = replacementFrame
    self.replacementFrameExtensions = replacementFrameExtensions
    self.replacementCount = replacementCount
    self.allTokens = []
    self.tagger = StanfordPOSTagger(self.posTagger_Model, self.posTagger_Jar,
                                    java_options='-mx1000m -Xmx1028m')
def set_pos_tagger(self, model_path=POS_MODEL, jar_path=POS_JAR):
    """Set up paths for the Stanford POS tagger. Default values are
    configured in cfg.py.

    Args:
        model_path: path to the trained model
        jar_path: path to the JAR
    """
    model_path = os.path.join(self.lib_path, model_path)
    jar_path = os.path.join(self.lib_path, jar_path)
    if os.path.isfile(model_path) and os.path.isfile(jar_path):
        self.pos_tagger = StanfordPOSTagger(model_path, jar_path)
    else:
        raise IOError('Cannot find POS tagging lib')
def _POS(self, txt, id):
    self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
    path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
    from nltk.tag.stanford import StanfordPOSTagger
    tagger = StanfordPOSTagger(model_path, path_pos)
    tagger.java_options = '-mx8096m'  # set a higher memory limit for long sentences
    tokens = nltk.word_tokenize(txt)
    pos_res = tagger.tag(tokens)
    filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
    with open(filepath, 'w') as file_handler:
        for item in pos_res:
            file_handler.write("{}\n".format(item))
    return pos_res
def extractPOS(inputFile_data, inputFile_tags, inputFile_version, outputFile_pos):
    f = open(inputFile_tags)
    allTags = set(f.read().split(","))  # load all tags
    f.close()
    f = open(inputFile_version)
    lines = f.readlines()
    f.close()
    tag_version = []  # tags with version number
    for index, row in enumerate(lines):
        items = row.strip().split()
        if items[0] in allTags:
            for tag in items[1].split(","):
                tag_version.append(tag)
    print "The number of tag_version is: ", len(tag_version)
    tag_version = set(tag_version)
    fw_pos = open(outputFile_pos, "w")
    # NOTE: only the model path is given here; stanford-postagger.jar must be
    # discoverable via the CLASSPATH environment variable for this to work.
    english_postagger = StanfordPOSTagger(
        '/Users/songshuaichen/Downloads/jars/models/english-bidirectional-distsim.tagger')
    f = open(inputFile_data)
    lines = f.readlines()
    f.close()
    for index, row in enumerate(lines):
        if index % 300 == 0:
            print index, " Finish ", float(index) / len(lines)
        items = row.strip().split(" ")
        # if index >= 5000 and index < 6000 and items[0] in tag_version:
        if items[0] in tag_version:
            fw_pos.write(str(index) + " " + items[0] + " \n")
        else:
            fw_pos.write(str(index) + " " + items[0] + " ")
            if len(items) > 1:
                text = items[1].split(". ")[0].decode('utf-8')
                pos = english_postagger.tag(text.split())
                for p in pos:
                    fw_pos.write(str(p))
                    fw_pos.write(" ")
            fw_pos.write("\n")
    fw_pos.close()
def transform_to_pos(text):
    import os
    #os.environ['JAVAHOME'] = java_path
    from nltk.corpus import sentiwordnet as swn
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize
    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  # set a higher memory limit for long sentences
    tokens = word_tokenize(text)
    size = len(tokens)
    from collections import Counter
    pos = tagger.tag(tokens)
    counts = Counter(tag for word, tag in pos)
    # normalise the tag counts by the number of tokens
    for key in counts:
        counts[key] /= size
    counts["totalWordsCount"] = size
    counts[";"] = tokens.count(";") / size
    counts["questionmarks"] = tokens.count("?") / size
    counts["exclamationmarks"] = tokens.count("!") / size
    counts["Quotes"] = tokens.count("\"") / size
    try:
        counts.pop(".")
    except KeyError:  # narrowed from a bare except
        pass
    from collections import OrderedDict
    ot = ['NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ',
          'RB', 'TO', 'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW',
          'VBG', ':', 'WRB', 'EX', 'JJR', 'WDT', 'totalWordsCount', ';',
          'questionmarks', 'exclamationmarks', 'Quotes']
    counts = OrderedDict(counts)
    # ensure every feature key exists, defaulting to 0
    for key in ot:
        if key not in counts:
            counts[key] = 0
    # drop any tags that are not part of the feature set
    tmp = counts.copy()
    for key in tmp:
        if key not in ot:
            counts.pop(key, None)
    # re-order the features to match `ot`
    dab = {}
    for i in ot:
        dab[i] = counts[i]
    counts = dab.copy()
    return counts
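# Sketch of how transform_to_pos() above might be used; the input string is
# illustrative, and the call assumes the ./postagging model and jar exist.
# The result is a dict of POS-tag frequencies normalised by token count,
# plus punctuation ratios and the raw token count.
features = transform_to_pos("Is this a question? Yes, it is!")
print(features['questionmarks'], features['totalWordsCount'])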
def build_question_set():
    sv_file = 'data/kprestval_pos_tags.json'
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    meta = load_and_process_metadata('val')
    images = split_data_by_seed(meta, 'kprestval')
    num = len(images)
    pos_tags_dict = {}
    for i, info in enumerate(images):
        question_id = info.question_id
        question = info.question.lower()
        _pos_tags = st.tag(word_tokenize(question))
        pos_tags_dict[question_id] = _pos_tags
        print('\nPOS TAGGER: %d/%d' % (i, num))
        print(_pos_tags)
    save_json(sv_file, {'pos_tags': pos_tags_dict})
def __init__(self):
    self._postagger = StanfordPOSTagger(
        self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
        self.modeldir + '/postagger/stanford-postagger.jar')
    self._stemmer = nltk.SnowballStemmer("english")
    self._stopwords = stopword(self.stopword_path)
    self._type_words = self._set_type_words()
    self._sentiment = self._get_sentiment()
def __init__(self, text):
    reload(sys)
    sys.setdefaultencoding('utf8')
    self.text = text
    root = os.path.dirname(os.path.realpath(__file__))
    os.environ["STANFORD_PARSER"] = root + "/stanford-postagger/stanford-postagger.jar"
    os.environ["STANFORD_MODELS"] = root + "/stanford-postagger/models/"
    _path_to_model = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
    _path_to_jar = root + '/stanford-postagger/stanford-postagger.jar'
    self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
    self.sentences = sent_tokenize(text.encode("utf-8"))
    self.words = word_tokenize(text.encode("utf-8"))
    self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
    #cleanWords = self.cleanWords()
    #self.tags = self.stringifyTuples(self.stanford.tag(cleanWords))
    #print self.cleanWords()
    self.taggedBigrams = self.ngramsAndTags(2)
def __init__(self, tagger): """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger.""" self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar) if tagger == POSTagger.STAN: self.tagger_name = POSTagger.STAN self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar, model_filename=POSTagger.__model_file_name + "german-hgc.tagger") elif tagger == POSTagger.SFT: self.tagger_name = POSTagger.SFT self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar, model_filename=POSTagger.__model_file_name + "german-fast.tagger") elif tagger == POSTagger.TT: self.tagger_name = POSTagger.TT self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de') # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards elif tagger == POSTagger.SPACY: self.tagger_name = POSTagger.SPACY self.__tagger = spacy.load('de') else: raise Exception("Wrong tagger parameter.")
class String2POSNGramsList(String2TokenList):

    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):
        # Other taggers:
        # 1. 'english-bidirectional-distsim.tagger'
        # 2. 'english-left3words-distsim.tagger'
        super(String2POSNGramsList, self).__init__()
        # N-grams size
        self.n = n
        # Tagger class selection; see the Stanford Tagger documentation for details.
        self.tagger_cls = tagger_cls
        # Get the Stanford tagger instance.
        self.spt = StanfordPOSTagger(self.tagger_cls)
        # self.spt = CoreNLPPOSTagger(url='http://localhost:9000')
        self.spt.java_options = '-mx10g'

    @property
    def N(self):
        return self.n

    @N.setter
    def N(self, value):
        self.n = value

    @property
    def Tagger_cls(self):
        # NOTE: the original returned self.n here, which looks like a
        # copy-paste slip; this property exposes the tagger class.
        return self.tagger_cls

    @Tagger_cls.setter
    def Tagger_cls(self, value):
        self.tagger_cls = value

    def terms_lst(self, text):
        # Get the analysed list of tokens.
        analyzed_terms_lst = self.token_lst(text)
        # Tag the analysed terms list and keep the tags as terms.
        pos_tags = [pos for t, pos in self.spt.tag(analyzed_terms_lst)]
        # Construct the POS N-grams list.
        analyzed_terms_lst = [
            " ".join(pos_tags[i: i + self.n])
            for i in range(len(pos_tags) - self.n + 1)
        ]
        return analyzed_terms_lst
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence (needed to build the end-to-end system).
    :param sentences_spans: (sentence text, span) pairs to tag
    :param pos_vocab: counter of POS tags, updated in place
    :return: the POS-tagged sentences and the updated pos_vocab
    """
    #raw_dir_simple = read.read_from_json('test/test_dir_simple')  # in folder data/
    #raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    #raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')
    #raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        StandforParser,       # in folder data/
        StandforParser_jar)   # in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()
    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        # StanfordPOSTagger fails to tag the underscore; see
        # https://github.com/nltk/nltk/issues/1632. With nltk 3.2.2, change
        # "word_tags = tagged_word.strip().split(self._SEPARATOR)" in the
        # parse_output function of nltk/tag/stanford.py to
        # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)"
        # to handle the underscore issue.
        text_pos = english_postagger.tag(text)
        index = 0
        for token in text_pos:
            # Deal with double quotes: nltk.tokenize's treebank.py converts
            # double quotes (") into doubled single forward/backward quotes
            # (`` and ''), so map them back when they were not in the source.
            if text[index] == token[0] and token[0] == "``" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] == token[0] and token[0] == "''" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] in ['{', '(', '[']:
                text_pos[index] = [token[0], "("]
            if text[index] == token[0] and token[0] in ['}', ')', ']']:
                text_pos[index] = [token[0], ")"]
            pos_vocab[token[1]] += 1
            index += 1
        pos_sentences.append(text_pos)
    return pos_sentences, pos_vocab
def __init__(self, pos_model, stanford_tagger, java_path):
    """
    Creates a POSTagSelector instance.

    @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
    The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param stanford_tagger: Path to the "stanford-postagger.jar" file.
    The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param java_path: Path to the system's "java" executable. Can commonly be found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
    """
    os.environ['JAVAHOME'] = java_path
    self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
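# Hypothetical construction of the selector whose __init__ appears above
# (its docstring names the POSTagSelector class, defined in full further
# below); all three paths are assumptions and must point at a real Stanford
# model, jar, and java binary on your system.
selector = POSTagSelector(
    '/opt/stanford-postagger/models/english-bidirectional-distsim.tagger',
    '/opt/stanford-postagger/stanford-postagger.jar',
    '/usr/bin/java')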
def __init__(self, condprob_model, pos_model, stanford_tagger, java_path):
    """
    Creates a POSTagSelector instance.

    @param condprob_model: Path to a binary conditional probability model. For instructions on how to create the model, please refer to the LEXenstein Manual.
    @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
    The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param stanford_tagger: Path to the "stanford-postagger.jar" file.
    The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param java_path: Path to the system's "java" executable. Can commonly be found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
    """
    os.environ['JAVAHOME'] = java_path
    self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
    self.model = pickle.load(open(condprob_model, 'rb'))
def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):
    # Other taggers:
    # 1. 'english-bidirectional-distsim.tagger'
    # 2. 'english-left3words-distsim.tagger'
    super(String2POSNGramsList, self).__init__()
    # N-grams size
    self.n = n
    # Tagger class selection; see the Stanford Tagger documentation for details.
    self.tagger_cls = tagger_cls
    # Get the Stanford tagger instance.
    self.spt = StanfordPOSTagger(self.tagger_cls)
    # self.spt = CoreNLPPOSTagger(url='http://localhost:9000')
    self.spt.java_options = '-mx10g'
def __init__(self, text):
    reload(sys)
    sys.setdefaultencoding('utf8')
    self.text = text
    # __file__ (not _file_) gives the path of the current module; the path
    # suffixes below were empty in the original and are restored from the
    # parallel NltkHelper.__init__ snippet elsewhere in this section.
    root = os.path.dirname(os.path.realpath(__file__))
    os.environ["STANFORD_PARSER"] = root + "/stanford-postagger/stanford-postagger.jar"
    os.environ["STANFORD_MODELS"] = root + "/stanford-postagger/models/"
    _path_to_model = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
    _path_to_jar = root + '/stanford-postagger/stanford-postagger.jar'
    self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
    self.sentences = sent_tokenize(text.encode("utf-8"))
    self.words = word_tokenize(text.encode("utf-8"))
    self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
    #cleanWords
    self.taggedBigrams = self.ngramsAndTags(2)
def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
    """
    Creates an instance of the WordVectorSelector class.

    @param vector_model: Path to a binary word vector model. For instructions on how to create the model, please refer to the LEXenstein Manual.
    @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
    The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param stanford_tagger: Path to the "stanford-postagger.jar" file.
    The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
    @param java_path: Path to the system's "java" executable. Can commonly be found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
    @param pos_type: The type of POS tags with which the model's words are annotated, if any. Values supported: none, treebank, paetzold
    """
    self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
    self.pos_type = pos_type
    os.environ['JAVAHOME'] = java_path
    self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)
class POSTagSelector:

    def __init__(self, pos_model, stanford_tagger, java_path):
        """
        Creates a POSTagSelector instance.

        @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
        The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param stanford_tagger: Path to the "stanford-postagger.jar" file.
        The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param java_path: Path to the system's "java" executable. Can commonly be found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
        """
        os.environ['JAVAHOME'] = java_path
        self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

    def selectCandidates(self, substitutions, victor_corpus):
        """
        Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

        @param substitutions: Candidate substitutions to be filtered. It can be in two formats:
        A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. Example: substitutions['perched'] = {'sat', 'roosted'}
        A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
        @param victor_corpus: Path to a corpus in the VICTOR format. For more information about the file's format, refer to the LEXenstein Manual.
        @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
        """
        selected_substitutions = []

        substitution_candidates = []
        if isinstance(substitutions, list):
            substitution_candidates = substitutions
        elif isinstance(substitutions, dict):
            void = VoidSelector()
            substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
        else:
            print('ERROR: Substitutions are neither a dictionary nor a list!')
            return selected_substitutions

        # Read the VICTOR corpus:
        lexf = open(victor_corpus)
        sents = []
        targets = []
        heads = []
        words = set([])
        c = -1
        for line in lexf:
            c += 1
            data = line.strip().split('\t')
            sent = data[0].strip().split(' ')
            target = data[1].strip()
            head = int(data[2].strip())
            sents.append(sent)
            targets.append(target)
            heads.append(head)
            words.update(set(substitution_candidates[c]))
        lexf.close()

        # Tag sentences:
        tagged_sents = self.tagger.tag_sents(sents)

        # Tag words:
        words = list(words)
        words_sents = [[w] for w in words]
        tagged_words = self.tagger.tag_sents(words_sents)
        word_to_tag = {}
        for i in range(0, len(words)):
            word_to_tag[words[i]] = tagged_words[i][0][1]

        # Keep only candidates sharing the target's POS tag:
        for i in range(0, len(sents)):
            target = targets[i]
            head = heads[i]
            target_pos = str(tagged_sents[i][head][1])
            candidates = set(substitution_candidates[i])
            candidates = self.getCandidatesWithSamePOS(candidates, word_to_tag, target_pos)
            selected_substitutions.append(candidates)

        return selected_substitutions

    def getTargetPOS(self, sent, target, head):
        pos_data = []
        try:
            pos_data = nltk.pos_tag(sent)
            return pos_data[head][1]
        except UnicodeDecodeError:
            try:
                pos_data = nltk.pos_tag(target)
                return pos_data[0][1]
            except UnicodeDecodeError:
                return 'None'

    def getCandidatesWithSamePOS(self, candidates, word_to_tag, target_pos):
        result = set([])
        for candidate in candidates:
            if candidate in word_to_tag:
                ctag = word_to_tag[candidate]
                if ctag == target_pos:
                    result.add(candidate)
        return result

    def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
        """
        Saves a set of selected substitutions in a file in VICTOR format.

        @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
        @param substitutions: The vector of substitutions selected for the VICTOR corpus.
        @param output_path: The path in which to save the resulting VICTOR corpus.
        @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
        """
        o = open(output_path, 'w')
        f = open(victor_corpus)
        for subs in substitutions:
            data = f.readline().strip().split('\t')
            sentence = data[0].strip()
            target = data[1].strip()
            head = data[2].strip()
            newline = sentence + '\t' + target + '\t' + head + '\t'
            for sub in subs:
                newline += '0:' + sub + '\t'
            o.write(newline.strip() + '\n')
        f.close()
        o.close()
#!/usr/local/bin/python
# coding: latin-1
import nltk, string, os
from random import randint
from nltk.tag.stanford import StanfordPOSTagger

os.environ['CLASSPATH'] = '../ressources/standforPOS/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = '../ressources/standforPOS/models'
st = StanfordPOSTagger('german-hgc.tagger')
# tagset: http://www.coli.uni-saarland.de/projects/sfb378/negra-corpus/stts.asc

##############################
# Import the text
##############################
f = open("../ressources/reiseberichtIndien.txt")
raw_text = f.read()
start_bookmark = raw_text.find("Erstes Kapitel")
end_bookmark = raw_text.rfind("Im Verlag von R")
text = raw_text[start_bookmark:end_bookmark]

##############################
# Tokenize the text
    # (fragment: the opening of this tag-generalisation function is missing)
    elif tag.startswith('J'):
        result = 'J'
    elif tag.startswith('W'):
        result = 'W'
    elif tag.startswith('PRP'):
        result = 'P'
    else:
        result = tag.strip()
    return result

model = '/export/data/ghpaetzold/benchmarking/lexmturk/scripts/evaluators/stanford-postagger-full-2015-04-20/models/english-bidirectional-distsim.tagger'
tagger = '/export/data/ghpaetzold/benchmarking/lexmturk/scripts/evaluators/stanford-postagger-full-2015-04-20/stanford-postagger.jar'
java = '/usr/bin/java'
os.environ['JAVAHOME'] = java
tagger = StanfordPOSTagger(model, tagger, java_options='-Xmx6g')

f = open('ratings.txt')
sents1 = []
sents2 = []
heads1 = []
heads2 = []
for line in f:
    data = line.strip().split('\t')
    word1 = data[1].strip()
    word2 = data[3].strip()
    sent1 = data[5].strip()
    newsent1 = ''
    tokens = sent1.split(' ')
    index1 = -1
class WordVectorSelector:

    def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'):
        """
        Creates an instance of the WordVectorSelector class.

        @param vector_model: Path to a binary word vector model. For instructions on how to create the model, please refer to the LEXenstein Manual.
        @param pos_model: Path to a POS tagging model for the Stanford POS Tagger.
        The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param stanford_tagger: Path to the "stanford-postagger.jar" file.
        The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml
        @param java_path: Path to the system's "java" executable. Can commonly be found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems.
        @param pos_type: The type of POS tags with which the model's words are annotated, if any. Values supported: none, treebank, paetzold
        """
        self.model = gensim.models.word2vec.Word2Vec.load_word2vec_format(vector_model, binary=True)
        self.pos_type = pos_type
        os.environ['JAVAHOME'] = java_path
        self.tagger = StanfordPOSTagger(pos_model, stanford_tagger)

    def selectCandidates(self, substitutions, victor_corpus, proportion=1.0, proportion_type='percentage', stop_words_file=None, window=99999, onlyInformative=False, keepTarget=False, onePerWord=False):
        """
        Selects which candidates can replace the target complex words in each instance of a VICTOR corpus.

        @param substitutions: Candidate substitutions to be filtered. It can be in two formats:
        A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. Example: substitutions['perched'] = {'sat', 'roosted'}
        A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. Example: [['sat', 'roosted'], ['easy', 'uncomplicated']]
        @param victor_corpus: Path to a corpus in the VICTOR format. For more information about the file's format, refer to the LEXenstein Manual.
        @param proportion: Percentage of substitutions to keep. If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1. If proportion_type is set to "integer", then this parameter must be an integer number.
        @param proportion_type: Type of proportion to be kept. Values supported: percentage, integer.
        @param stop_words_file: Path to the file containing stop words of the desired language. The file must contain one stop word per line.
        @param window: Number of tokens around the target complex sentence to consider as its context.
        @param onlyInformative: If True, only content words are considered as part of the complex word's context, such as nouns, verbs, adjectives and adverbs.
        @param keepTarget: If True, the complex target word is also included as part of its context.
        @param onePerWord: If True, a word in the complex word's context can only contribute once to its resulting word vector.
        @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus.
        """
        # Initialize selected substitutions:
        selected_substitutions = []

        # Read stop words:
        stop_words = set([])
        if stop_words_file is not None:
            stop_words = set([word.strip() for word in open(stop_words_file)])

        # Configure input:
        substitution_candidates = []
        if isinstance(substitutions, list):
            substitution_candidates = substitutions
        elif isinstance(substitutions, dict):
            void = VoidSelector()
            substitution_candidates = void.selectCandidates(substitutions, victor_corpus)
        else:
            print('ERROR: Substitutions are neither a dictionary nor a list!')
            return selected_substitutions

        # Parse sentences:
        lexf = open(victor_corpus)
        sents = [line.strip().split('\t')[0].strip().split(' ') for line in lexf]
        lexf.close()
        tagged_sents = self.tagger.tag_sents(sents)

        # Transform them to the right format:
        if self.pos_type == 'paetzold':
            transformed = []
            for sent in tagged_sents:
                tokens = []
                for token in sent:
                    tokens.append((token[0], getGeneralisedPOS(token[1])))
                transformed.append(tokens)
            tagged_sents = transformed

        # Rank candidates:
        c = -1
        lexf = open(victor_corpus)
        for line in lexf:
            c += 1
            data = line.strip().split('\t')
            sent = data[0].strip()
            target = data[1].strip()
            head = int(data[2].strip())
            pos_tags = tagged_sents[c]
            target_pos = pos_tags[head][1]
            target_vec = self.getSentVec(sent, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tags)
            candidates = substitution_candidates[c]
            candidate_dists = {}
            for candidate in candidates:
                candidate_vec = self.getWordVec(candidate, target_pos)
                try:
                    candidate_dists[candidate] = cosine(candidate_vec, target_vec)
                except ValueError:
                    candidate_dists = candidate_dists
            final_candidates = self.getFinalCandidates(candidate_dists, proportion, proportion_type)
            selected_substitutions.append(final_candidates)
        lexf.close()

        return selected_substitutions

    def getSentVec(self, sentence, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tokens):
        informative_tags = set([])
        if onlyInformative:
            if self.pos_type == 'treebank':
                informative_tags = set(['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS'])
            if self.pos_type == 'paetzold':
                informative_tags = set(['N', 'V', 'J', 'R'])
        tokens = sentence.split(' ')
        valid_tokens = []
        if keepTarget:
            valid = tokens[head].strip()
            if self.pos_type != 'none':
                valid += '|||' + pos_tokens[head][1]
            valid_tokens.append(valid)
        if head > 0:
            for i in range(max(0, head - window), head):
                if len(informative_tags) == 0 or pos_tokens[i][1].lower().strip() in informative_tags:
                    if tokens[i] not in stop_words:
                        valid = tokens[i]
                        if self.pos_type != 'none':
                            valid += '|||' + pos_tokens[i][1]
                        valid_tokens.append(valid)
        if head < len(tokens) - 1:
            for i in range(head + 1, min(len(tokens), head + 1 + window)):
                if len(informative_tags) == 0 or pos_tokens[i][1].lower().strip() in informative_tags:
                    if tokens[i] not in stop_words:
                        valid = tokens[i]
                        if self.pos_type != 'none':
                            valid += '|||' + pos_tokens[i][1]
                        valid_tokens.append(valid)
        if onePerWord:
            valid_tokens = list(set(valid_tokens))
        # Sum the vectors of all valid context tokens, then average:
        result = []
        for token in valid_tokens:
            if len(result) == 0:
                try:
                    result = self.model[token]
                except Exception:
                    result = []
            else:
                try:
                    result = np.add(result, self.model[token])
                except Exception:
                    result = result
        result = result / float(len(valid_tokens))
        return result

    def getWordVec(self, candidate, target_pos):
        cand = None
        if self.pos_type != 'none':
            cand = candidate + '|||' + target_pos
        else:
            cand = candidate
        result = np.array([])
        try:
            result = self.model[cand]
        except Exception:
            pass
        return result

    def getFinalCandidates(self, candidate_dists, proportion, proportion_type):
        result = sorted(list(candidate_dists.keys()), key=candidate_dists.__getitem__)
        if proportion_type == 'percentage':
            return result[0:max(1, int(proportion * float(len(result))))]
        elif proportion_type == 'integer':
            if proportion >= len(result):
                return result
            else:
                return result[0:max(1, int(proportion))]
        else:
            print('Unrecognized proportion type.')
            return result

    def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False):
        """
        Saves a set of selected substitutions in a file in VICTOR format.

        @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected.
        @param substitutions: The vector of substitutions selected for the VICTOR corpus.
        @param output_path: The path in which to save the resulting VICTOR corpus.
        @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution.
        """
        o = open(output_path, 'w')
        f = open(victor_corpus)
        for subs in substitutions:
            data = f.readline().strip().split('\t')
            sentence = data[0].strip()
            target = data[1].strip()
            head = data[2].strip()
            newline = sentence + '\t' + target + '\t' + head + '\t'
            for sub in subs:
                newline += '0:' + sub + '\t'
            o.write(newline.strip() + '\n')
        f.close()
        o.close()
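# Hedged end-to-end sketch for WordVectorSelector above; the model, tagger,
# jar and corpus paths are placeholders, and `subs` follows the dictionary
# format documented in selectCandidates().
selector = WordVectorSelector('vectors.bin',
                              'models/english-bidirectional-distsim.tagger',
                              'stanford-postagger.jar',
                              '/usr/bin/java',
                              pos_type='treebank')
subs = {'perched': {'sat', 'roosted'}}
selected = selector.selectCandidates(subs, 'victor_corpus.txt', proportion=0.5)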
## This code extracts the features for several glosses and stores them in two
## text files to be fed to evaluation.py or predictGoodness.py

## import everything needed
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger
import string
from pattern.de import singularize
import subprocess
import os

## set variables
parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz")
st = StanfordPOSTagger('german-dewac.tagger')
featuresPhrases = []
finalRatings = []
count = 0
path = '/home/hanna/Documents/SMOR/'

## read in the word frequencies from DeReWo
derewo = open('derewo-v-ww-bll-320000g-2012-12-31-1.0.txt')
freqWo = []
freqNo = []
for lines in derewo:
    lines = lines.strip()
    parts = lines.split(" ")
    freqWo.append(parts[0].lower())
    freqNo.append(int(float(parts[1])))
def __init__(self):
    self.st = StanfordPOSTagger(
        'stanford-postagger/models/english-bidirectional-distsim.tagger',
        'stanford-postagger/stanford-postagger.jar')
class SenticParser:

    def __init__(self):
        self.st = StanfordPOSTagger(
            'stanford-postagger/models/english-bidirectional-distsim.tagger',
            'stanford-postagger/stanford-postagger.jar')

    def TaggedSentenceSlashForm(self, sentence):
        #print sentence.split()
        Tagged = self.st.tag(sentence.split())
        TaggedSentence = ""
        for i in Tagged:
            TaggedSentence = TaggedSentence + "/".join(i) + " "
        #print TaggedSentence
        return TaggedSentence

    def TaggedSentence(self, sentence):
        Tagged = self.st.tag(sentence.split())
        return Tagged

    def FindStemmedVerb(self, word):
        st = LancasterStemmer()
        StemmedVerb = st.stem(word)
        dic = enchant.Dict("en_US")
        if dic.check(StemmedVerb):
            return StemmedVerb
        else:
            return StemmedVerb + "e"

    def FindSplit(self, sentence, TaggedSentence):
        TokenizedSentence = nltk.word_tokenize(sentence)
        SplitList = []
        SentAdded = ""
        split = 0
        #print TaggedSentence
        for i in range(len(TaggedSentence)):
            if TaggedSentence[i][1].startswith("VB"):
                SplitList.append(SentAdded)
                try:
                    if TaggedSentence[i + 1][1].startswith("VB"):
                        SentAdded = ""
                    else:
                        SplitList.append(SentAdded)
                        SentAdded = TaggedSentence[i][0] + " "
                        # print "split"
                except:
                    # i + 1 ran past the end of the sentence
                    SplitList.append(TaggedSentence[i][0])
            else:
                #print SentAdded
                SentAdded = SentAdded + TokenizedSentence[i] + " "
        SplitList.append(SentAdded)
        Str_list = filter(None, SplitList)
        Str_list = list(set(Str_list))
        '''
        for i in range(len(Str_list)):
            Str_list[i] = Str_list[i][:-1].translate(string.maketrans("", ""), string.punctuation)
        '''
        return Str_list
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
class Parser:

    def __init__(self):
        self.MatchList = []
        self.ConceptMatches = []
        self.st = StanfordPOSTagger(
            'stanford-postagger/models/english-bidirectional-distsim.tagger',
            'stanford-postagger/stanford-postagger.jar')

    def SyntacticMatch(self, concept1, concept2):
        # Checks for syntactic similarity: looks for matching words
        # between the two concepts.
        TaggedConcept1 = self.st.tag(nltk.word_tokenize(concept1))
        TaggedConcept2 = self.st.tag(nltk.word_tokenize(concept2))
        print TaggedConcept1
        print TaggedConcept2
        flag = 0
        for i in TaggedConcept1:
            for j in TaggedConcept2:
                if i == j:
                    if i[1].startswith("NN"):
                        flag = 1
        if flag == 1:
            return True
        else:
            return False

    def FindBigrams(self, concept):
        # Finds all bigrams associated with the concept. The given concept is
        # split into bigrams, e.g. "a very special christmas gift" is split as
        # ["a very", "very special", "special christmas", "christmas gift"].
        sentence = self.st.tag(nltk.word_tokenize(concept))
        print sentence
        Bigrams = []
        for i in range(len(sentence) - 1):
            if sentence[i][1] == "JJ" and sentence[i + 1][0] in stopwords.words('english'):
                # [adj + stopword]: ignore; bigrams like "a very" are dropped
                continue
            elif sentence[i][0] in stopwords.words('english') and sentence[i + 1][0] in stopwords.words('english'):
                # [stopword + stopword]: ignore
                continue
            elif sentence[i + 1][1] == "JJ" and sentence[i][0] in stopwords.words('english'):
                # [stopword + adj]: ignore; bigrams like "amazingly a" are dropped
                continue
            elif sentence[i][1] == "JJ" and sentence[i + 1][1].startswith("NN"):
                # [adj + concept]: include both [adj + concept] and [concept],
                # e.g. "special christmas" yields "special christmas" and "christmas"
                Bigrams.append(sentence[i + 1][0])
                Bigrams.append(sentence[i][0] + " " + sentence[i + 1][0])
            elif sentence[i][0] in stopwords.words("english") and sentence[i + 1][1].startswith("NN"):
                # [stopword + concept]: include the concept with and without the
                # stopword, e.g. "the christmas" yields "christmas" and "the christmas"
                Bigrams.append(sentence[i + 1][0])
                Bigrams.append(sentence[i][0] + " " + sentence[i + 1][0])
            elif sentence[i][1].startswith("NN") and sentence[i + 1][1] == "JJ":
                # bigram ends with an adjective: ignore the adjective,
                # e.g. "present amazing" yields "present"
                Bigrams.append(sentence[i][0])
            elif sentence[i][1].startswith("NN") and sentence[i + 1][0] in stopwords.words("english"):
                # bigram ends with a stopword: ignore the stopword,
                # e.g. "christmas the" yields "christmas"
                Bigrams.append(sentence[i][0])
            else:
                Bigrams.append(sentence[i][0] + " " + sentence[i + 1][0])
        print Bigrams
        return Bigrams
import os
import re
import string
import sys
import operator

from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import sent_tokenize


class NltkHelper:
    def __init__(self, text):
        # Python 2 workaround so the str()/encode() calls below do not raise
        # on unicode input.
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.text = text
        root = os.path.dirname(os.path.realpath(__file__))
        os.environ["STANFORD_PARSER"] = root + "/stanford-postagger/stanford-postagger.jar"
        os.environ["STANFORD_MODELS"] = root + "/stanford-postagger/models/"
        _path_to_model = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
        _path_to_jar = root + '/stanford-postagger/stanford-postagger.jar'
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))
        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
        self.taggedBigrams = self.ngramsAndTags(2)

    def personal_names(self):
        # Collects bigrams whose two tags are both proper nouns or foreign
        # words, e.g. (('Angela', 'NNP'), ('Merkel', 'NNP')).
        output = []
        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]
            if self.isPersonalName(tag1) and self.isPersonalName(tag2):
                output.append("{0} {1}".format(word1, word2))
        return output

    def isPersonalName(self, tag):
        return tag == "NNP" or tag == "FW"

    def preprocessTitle(self):
        # Title-cases probable names and lower-cases everything else.
        output = ''
        for taggedWord in self.tags:
            word = taggedWord[0]
            tag = taggedWord[1]
            if self.isPersonalName(tag):
                output = "{0} {1}".format(output, word.title())
            else:
                output = "{0} {1}".format(output, word.lower())
        return output

    def ngramsAndTags(self, n):
        # Sliding window of n consecutive (word, tag) tuples.
        output = []
        for i in range(len(self.tags) - n + 1):
            gram = (self.tags[i],)
            for j in range(i + 1, i + n):
                gram += (self.tags[j],)
            output.append(gram)
        return output

    def sortFrequencies(self, ngram):
        return sorted(ngram.items(), key=operator.itemgetter(1), reverse=True)

    def findTags(self):
        # Keeps [adj + noun] and [noun + noun] bigrams; pairs such as
        # (('reports', 'NNS'), ('claim', 'VBP')) are skipped.
        output = []
        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]
            if (self.isAdj(tag1) and self.isNounOrForeignWord(tag2)) or \
                    (self.isNounOrForeignWord(tag1) and self.isNounOrForeignWord(tag2)):
                output.append("{0} {1}".format(word1, word2))
        return output

    def isAdj(self, tag):
        return tag == 'JJ'

    def isNounOrForeignWord(self, tag):
        return tag in ['NN', 'NNS', 'NNP', 'NNPS', 'FW']

    def stringifyList(self, items):
        return [str(item.encode('utf-8')) for item in items]

    def stringifyTuples(self, tuples):
        return [(str(t[0].encode('utf-8')), str(t[1].encode('utf-8'))) for t in tuples]

    def analyze(self):
        """Returns a list of tuples of tagged words in the text."""
        output = []
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            output.append(taggedWords)
        return self.stringifyTuples(taggedWords)

    def filterNounsInText(self):
        """Returns a list of nouns and foreign words."""
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            for item in taggedWords:
                if item[1] in nouns:
                    output.add(item[0])
        return self.stringifyList(list(output))

    def cleanWords(self):
        # Joins the token list back into a string, strips footnote markers
        # like [12], collapses whitespace, drops non-ASCII bytes, then keeps
        # tokens longer than one character (plus "a" and "i").
        text = ' '.join(self.words)
        text = re.sub('\n+', " ", text)
        text = re.sub(r'\[[0-9]*\]', "", text)
        text = re.sub(' +', " ", text)
        text = text.decode('ascii', 'ignore')
        cleanInput = []
        for item in text.split(" "):
            item = item.strip(string.punctuation)
            if len(item) > 1 or item.lower() in ('a', 'i'):
                cleanInput.append(item)
        return cleanInput

    def bigramNouns(self, text):
        return self.filterNouns(text)

    def isTagNounOrForeignWord(self, word):
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        taggedWords = self.stanford.tag([word.lower()])
        for item in taggedWords:
            if item[1] in nouns:
                return True
        return False

    def filterNouns(self, text):
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        for sentence in sent_tokenize(text):
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            for item in taggedWords:
                if item[1] in nouns:
                    output.add(item[0])
        return self.stringifyList(list(output))

    def define(self, word):
        definitions = []
        try:
            for synset in wn.synsets(word):
                definitions.append(synset.definition())
        except ValueError:
            print "Cannot define '{0}'".format(word)
        return definitions

    def sentenceExamples(self, noun):
        output = []
        try:
            for synset in wn.synsets(noun):
                for example in synset.examples():
                    output.append(example)
        except (ValueError, AttributeError):
            print "Cannot find any example for '{0}'".format(noun)
        return output
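# Minimal usage sketch, assuming the stanford-postagger/ directory sits next
# to this file; the sample headline is purely illustrative.
if __name__ == '__main__':
    helper = NltkHelper(u"Angela Merkel met reporters in Berlin today")
    print helper.personal_names()   # bigrams tagged NNP/FW on both sides
    print helper.findTags()         # adjective-noun and noun-noun bigrams
    print helper.preprocessTitle()  # names title-cased, everything else lowered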
import os
import sys

from nltk.parse import stanford
from nltk.tag.stanford import StanfordPOSTagger

my_sentence = "The quick brown fox jumps over the lazy dog"  # assumed example input; original definition not shown

os.environ['STANFORD_MODELS'] = 'C:/stanford_data/stanford-parser-3.5.2-models.jar'
parser = stanford.StanfordParser(model_path="C:/stanford_data/englishPCFG.ser.gz")

# raw_parse returns a one-shot generator; materialize it so the trees can be
# both printed and drawn below.
parsed_sentences = list(parser.raw_parse(my_sentence))
for tree in parsed_sentences:
    print(tree)

# GUI: opens a window per parse tree.
for tree in parsed_sentences:
    tree.draw()

st = StanfordPOSTagger(r'C:/stanford_data/english-bidirectional-distsim.tagger',
                       r'C:/stanford_data/stanford-postagger.jar')
bobo = st.tag(my_sentence.split())
print(bobo)
for i in bobo:
    print(i)
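# Side note: each parse tree already carries POS tags, so the separate tagger
# pass is not strictly required; Tree.pos() (standard nltk.Tree API) flattens
# a tree into (word, tag) pairs.
for tree in parsed_sentences:
    print(tree.pos())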
from nltk.tag.stanford import StanfordPOSTagger
import nltk
import os

os.environ['CLASSPATH'] = "/home/vishesh/Downloads/stanford-postagger-full-2015-12-09/"
# Also point STANFORD_MODELS at the tagger directory so the relative model
# path below resolves.
os.environ['STANFORD_MODELS'] = "/home/vishesh/Downloads/stanford-postagger-full-2015-12-09/"

english_postagger = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')
print english_postagger.tag(nltk.word_tokenize('this is stanford postagger in nltk for python users'))

fo = open('europarl-v7.de-en.de', 'r')
data = fo.read()
fo.close()

fw = open('europarl_tags_testing.txt', 'w')
data = data.decode('utf-8')
data = data.split('\n')

german_postagger = StanfordPOSTagger('/home/vishesh/Documents/NLP/postagger/models/german-fast-caseless.tagger')
for i in range(10000, 11500):
    tokens = nltk.word_tokenize(data[i])
    tags = german_postagger.tag(tokens)
    # Persist the tags (output format assumed: word/tag pairs, one line per sentence).
    fw.write(' '.join('%s/%s' % (word, tag) for word, tag in tags).encode('utf-8') + '\n')
fw.close()
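# Performance note: every .tag() call above spawns a fresh JVM. tag_sents()
# (standard NLTK StanfordPOSTagger API) batches all sentences through a
# single Java invocation; a sketch over the same corpus slice:
sentences = [nltk.word_tokenize(line) for line in data[10000:11500]]
all_tags = german_postagger.tag_sents(sentences)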
from gensim import matutils
from math import log
from collections import Counter, defaultdict
from sklearn.decomposition import NMF
from nltk.tokenize import TweetTokenizer
import nltk
import re
import os

path = '/home/kaminem64/stanford'
os.environ['CLASSPATH'] = ('%s/stanford-postagger-full-2015-04-20/stanford-postagger.jar:'
                           '%s/stanford-ner-2015-04-20/stanford-ner.jar:'
                           '%s/stanford-parser-full-2015-04-20/stanford-parser.jar:'
                           '%s/stanford-parser-full-2015-04-20/stanford-parser-3.6.0-models.jar'
                           % (path, path, path, path))
os.environ['STANFORD_MODELS'] = ('%s/stanford-postagger-full-2015-04-20/models:'
                                 '%s/stanford-ner-2015-04-20/classifiers' % (path, path))

from nltk.tag.stanford import StanfordPOSTagger
# With STANFORD_MODELS set above, the model can be referenced by filename alone.
stanford_pos_tag = StanfordPOSTagger('english-bidirectional-distsim.tagger')

import xlsxwriter
workbook = xlsxwriter.Workbook('topic_modeling.xlsx')
worksheet = workbook.add_worksheet()

row_num = 0
worksheet.write(row_num, 0, 'store_app_id')
worksheet.write(row_num, 1, 'name')
worksheet.write(row_num, 2, 'start_date')
worksheet.write(row_num, 3, 'end_date')
worksheet.write(row_num, 4, 'release_note')
worksheet.write(row_num, 5, 'topics')

app_ids = [307906541]  # , 282614216, 383298204, 421254504, 509993510,
previous_date = None
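# Quick smoke test of the tagger (illustrative release-note text), reusing
# the TweetTokenizer already imported above:
tokens = TweetTokenizer().tokenize("Fixed crashes and improved stability")
print(stanford_pos_tag.tag(tokens))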
import os
import logging

import fuzzy
import nltk
from nltk.tag.stanford import StanfordPOSTagger

# BASE_DIR, stopword() and the Type enum come from the surrounding project.

logger = logging.getLogger(__name__)


class Parser(object):
    modeldir = os.path.abspath(BASE_DIR + "/weiss/planner/models/")
    stopword_path = modeldir + "/english.stp"

    def __init__(self):
        self._postagger = StanfordPOSTagger(self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
                                            self.modeldir + '/postagger/stanford-postagger.jar')
        self._stemmer = nltk.SnowballStemmer("english")
        self._stopwords = stopword(self.stopword_path)
        self._type_words = self._set_type_words()
        self._sentiment = self._get_sentiment()

    def _get_sentiment(self):
        sentiment = {}
        for line in open(self.modeldir + "/AFINN.txt"):
            word, score = line.split('\t')
            sentiment[word] = int(score)
        return sentiment

    def calculate_sentiment(self, query):
        tokens = nltk.word_tokenize(query)
        score = 0
        for token in tokens:
            if token in self._sentiment:
                score += self._sentiment[token]
        return score

    def entity_recognition(self, query, arguments):
        """Parse the query and extract keywords.

        This function is called in planner.

        Args:
            query: query that needs to be parsed
            arguments: info that needs to be updated
        """
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        tuples = []
        for tag in tags:
            if tag[0] in self._stopwords:
                continue
            stemmed = self._stemmer.stem(tag[0])
            if stemmed in self._type_words['movie']:
                continue
            if stemmed in self._type_words['article']:
                continue
            if stemmed in self._type_words['restaurant']:
                continue
            if tag[1][:2] == 'NN' or tag[1][:2] == 'JJ':
                tuples.append(tag[0])
        if len(tuples) > 0:
            arguments['keywords'] = tuples
            logger.info("Here are the keywords: %s" % arguments['keywords'])

    def _set_type_words(self):
        """Initialize synonyms of movie, article and restaurant.

        This function is called during initialization.

        Returns:
            A dictionary keyed by movie, article and restaurant whose values
            are sets of stemmed synonyms.
        """
        topic = {}
        movie = ['cinema', 'show', 'film', 'picture', 'cinematograph', 'videotape',
                 'flick', 'pic', 'cine', 'cinematics', 'photodrama', 'photoplay',
                 'talkie', 'flicker', 'DVD', 'movie']
        article = ['report', 'announcement', 'story', 'account', 'newscast',
                   'headlines', 'press', 'communication', 'talk', 'word',
                   'communique', 'bulletin', 'message', 'dispatch', 'broadcast',
                   'statement', 'intelligence', 'disclosure', 'revelation',
                   'gossip', 'dispatch', 'news', 'article']
        restaurant = ['bar', 'cafeteria', 'diner', 'dining', 'saloon', 'coffeehouse',
                      'canteen', 'chophouse', 'drive-in', 'eatery', 'grill',
                      'lunchroom', 'inn', 'food', 'pizzeria', 'hideaway', 'cafe',
                      'charcuterie', 'deli', 'restaurant']
        topic['movie'] = set(self._stemmer.stem(m) for m in movie)
        topic['article'] = set(self._stemmer.stem(a) for a in article)
        topic['restaurant'] = set(self._stemmer.stem(r) for r in restaurant)
        return topic

    def type_recognition(self, query, arguments):
        """Identify the type of the topic: movie, article or restaurant.

        This is called in planner.

        Args:
            query: query that needs to be parsed
            arguments: info that needs to be updated
        """
        tokens = nltk.word_tokenize(query)
        first = self._stemmer.stem(tokens[0])
        last = self._stemmer.stem(tokens[-1])
        lastsecond = self._stemmer.stem(tokens[-2]) if len(tokens) > 1 else "toy"
        if (first in self._type_words['article']
                or last in self._type_words['article']
                or lastsecond in self._type_words['article']):
            arguments['tid'] = Type.News
        elif (first in self._type_words['restaurant']
                or last in self._type_words['restaurant']
                or lastsecond in self._type_words['restaurant']):
            arguments['tid'] = Type.Restaurant
        elif (first in self._type_words['movie']
                or last in self._type_words['movie']
                or lastsecond in self._type_words['movie']):
            arguments['tid'] = Type.Movie
        else:
            arguments['tid'] = Type.Unknown

    @staticmethod
    def _string_to_idx(number):
        if number == 'first' or number == 'one':
            return 0
        if number == 'second' or number == 'two':
            return 1
        if number == 'third' or number == 'three':
            return 2
        if number == 'fourth' or number == 'four':
            return 3
        if number == 'fifth' or number == 'five':
            return 4

    @staticmethod
    def keyword_matching(arguments, entities):
        words = arguments['keywords']
        phonics = set([])
        overlap = []
        for w in words:
            phonics.add(fuzzy.nysiis(w))
        for i in xrange(0, len(entities)):
            entity_name = nltk.word_tokenize(entities[i].name)
            entity_phonics = set([])
            for word in entity_name:
                entity_phonics.add(fuzzy.nysiis(word))
            # Float division: under Python 2 an int/int ratio would truncate to 0.
            common = float(len(phonics & entity_phonics)) / len(entity_phonics)
            if common == 1:
                arguments['idx'] = i
                return
            overlap.append(common)
        arguments['idx'] = overlap.index(max(overlap))

    def find_number(self, query, arguments, entities):
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        last = query.find('last')
        # Edge case: "first" cannot be tagged correctly.
        if len(query.split(" ")) <= 3 and query.find('first') != -1:
            arguments['idx'] = 0
            return
        number = None
        for t in tags:
            if t[1] == 'JJ' and t[0][-2:] in set(['th', 'nd', 'st', 'rd']):
                number = t[0]
                break
            elif t[1] == 'CD' and t[0]:
                number = t[0]
                if number.isdigit() and int(number) < 6:
                    arguments['idx'] = int(number) - 1
                    return
                break
        if number is not None:
            if last == -1:
                arguments['idx'] = self._string_to_idx(number)
            else:
                arguments['idx'] = len(entities) - self._string_to_idx(number) - 1
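# Usage sketch (hypothetical query; assumes the project supplies BASE_DIR,
# stopword() and the Type enum referenced above):
if __name__ == '__main__':
    p = Parser()
    args = {}
    p.type_recognition("find me a good pizzeria nearby", args)    # sets args['tid']
    p.entity_recognition("find me a good pizzeria nearby", args)  # fills args['keywords']
    print args.get('tid'), args.get('keywords')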
# -*- coding: utf-8 -*-
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

# The path where you have downloaded and unzipped the full tagger.
sp_dir = '/home/sarah/postagger/'
english_model = sp_dir + 'models/english-bidirectional-distsim.tagger'
chinese_model = sp_dir + 'models/chinese-distsim.tagger'
jar_path = sp_dir + 'stanford-postagger.jar'

# Testing the English POS tagger.
print "For the English model"
st_eng = StanfordPOSTagger(model_filename=english_model, path_to_jar=jar_path)
eng_sent = 'This is Stanford postagger in nltk for Python users.'
print eng_sent
eng_tokens = word_tokenize(eng_sent)
eng_tagged = st_eng.tag(eng_tokens)
for i in eng_tagged:
    print i

# Testing the Chinese POS tagger.
print "\n\nFor the Chinese model"
st_chi = StanfordPOSTagger(model_filename=chinese_model, path_to_jar=jar_path, encoding='utf-8')
chi_sent = '这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'
print chi_sent
chi_tokens = word_tokenize(chi_sent)
chi_tagged = st_chi.tag(chi_tokens)
for i in chi_tagged:
    print i
# print st_chi.tag('这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'.split())
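# Note (version-dependent quirk): the chinese-distsim model joins word and
# tag with '#', while some NLTK releases split Stanford output on '_', so a
# pair can come back as ('', u'这#PN'). A defensive unpacking sketch:
for word, tag in chi_tagged:
    if not word and '#' in tag:
        word, tag = tag.rsplit('#', 1)
    print word, tag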
from nltk import pos_tag, word_tokenize
from nltk.tag.stanford import StanfordPOSTagger

# from Utils import getQues
# txt = getQues()
# txt = "benim adim yahya"  # Turkish: "my name is yahya"
txt = "i am dentist"
tgr = StanfordPOSTagger('models/english-bidirectional-distsim.tagger', 'stanford-postagger.jar')
print tgr.tag(word_tokenize(txt))
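# The pos_tag import above also offers a pure-Python fallback when the
# Stanford jar is unavailable (needs the averaged_perceptron_tagger data):
print pos_tag(word_tokenize(txt))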