def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar',
                          encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # note: 1 concat stemmer(word) == stemmer(1 concat word)
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    # do not reset filtered_sent here: the original reassigned it before this
    # loop, which silently discarded all the '1'-prefixed tokens from block 1
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

    return filtered_sent
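# The function above depends on names defined elsewhere in its module
# (encoding, blockSeparator, filterList, stemmer, word_tokenize), so it is
# not runnable on its own. Below is a minimal self-contained sketch of the
# same filter-and-prefix idea; the separator, the POS whitelist and the
# Porter stemmer are assumptions, not the original module's values.
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

BLOCK_SEPARATOR = '|||'            # assumed block separator
FILTER_LIST = ['NN', 'NNS', 'JJ']  # assumed POS whitelist
STEMMER = PorterStemmer()

def filter_blocks(sent, tagger):
    out = ' '
    # prefix block-1 tokens with '1' and block-2 tokens with '2'
    for prefix, block in zip('12', sent.split(BLOCK_SEPARATOR)):
        for word, tag in tagger.tag(word_tokenize(block.strip().lower())):
            if tag in FILTER_LIST:
                out += prefix + STEMMER.stem(word) + ' '
    return out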
def vectorizer(tokens, w2v_db):
    db_path = w2v_db
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label
    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]:
                continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    # Output for debugging; total vs unique words.
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()
    return unsorted_kw, token_vecs
def cleanTokens(tokens):
    st = POSTagger('/models/german-fast.tagger')
    tags = st.tag(tokens)

    # keep proper nouns and nouns (NE/NN) longer than three characters
    def cleanTags(x):
        y = x[1]
        return True if re.match("NE|NN", y) and len(x[0]) > 3 else False

    clean_tags = filter(cleanTags, tags)

    # collect just the words from the (word, tag) pairs
    def buildSentens(arr):
        words = []
        for i in arr:
            words.append(i[0])
        return words

    clean = buildSentens(clean_tags)
    return clean
def main():
    st = POSTagger(
        "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger",
        "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar",
    )

    with open(sys.argv[1], "r") as f:
        content = f.read()

    sentences = re.split("\n|\.|\?", content)
    for s in sentences:
        if len(s) == 0:
            continue
        pieces = st.tag(s.split())
        strippedPieces = stripPieces(pieces)
        print " ".join(strippedPieces)
def postext_st(filename):
    # Opening of File
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'

    if type(filename) != str:
        raise IOError('Filename must be a string')

    # Preparing to Tokenize
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []
        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))

    # Tokenizing
    tokenedsents = []
    from nltk.tokenize.stanford import StanfordTokenizer
    for line in sents:
        tokenedsents.append(StanfordTokenizer().tokenize(line))

    # Parts of Speech Tagging
    posSents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')
    for line in tokenedsents:
        # Returns a list of a list of tuples
        posSents.append(st.tag(line))

    return posSents
def nltk_stanfordpos(inpath, outfolder):
    """POS-Tagging French text with Stanford POS-Tagger via NLTK."""
    print("\nLaunched nltk_stanfordpos.")

    import os
    import glob
    from nltk.tag.stanford import POSTagger

    for file in glob.glob(inpath):
        st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger',
                       '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
                       encoding="utf8")
        with open(file, "r", encoding="utf-8") as infile:
            untagged = infile.read()
            tagged = st.tag(untagged.split())
        taggedstring = ""
        for item in tagged:
            item = "\t".join(item)
            taggedstring = taggedstring + str(item) + "\n"
        basename = os.path.basename(file)
        cleanfilename = basename
        if not os.path.exists(outfolder):
            os.makedirs(outfolder)
        with open(os.path.join(outfolder, cleanfilename), "w") as output:
            output.write(taggedstring)
    print("Done.")
def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir

    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger',
                   '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)

        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]

    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    with open('classifierit.pickle', 'wb') as f:
        pickle.dump(classifierit, f)

    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    with open('classifierloose.pickle', 'wb') as f:
        pickle.dump(classifierloose, f)

    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    with open('classifieryou.pickle', 'wb') as f:
        pickle.dump(classifieryou, f)

    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    with open('classifierto.pickle', 'wb') as f:
        pickle.dump(classifierto, f)

    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    with open('classifiertheir.pickle', 'wb') as f:
        pickle.dump(classifiertheir, f)
def stanford_tag(sentence):
    ''' use stanford tagger to tag a single tokenized sentence
    '''
    import src.experiment.path as path
    tagger = POSTagger(path.stanford_tagger_model_path(),
                       path.stanford_tagger_path(),
                       java_options='-Xmx16g -XX:MaxPermSize=256m')
    return tagger.tag(sentence)
def spanish_pos(text):
    """ Parts of speech tagger for Spanish """
    text = text.encode('utf8')
    st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger',
                   '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar',
                   'utf8')
    pos_tagged = st.tag(text.split())
    return pos_tagged
def german_pos(text):
    """ Parts of speech tagger for German """
    text = text.encode('utf8')
    st = POSTagger('/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger',
                   '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar',
                   'utf8')
    pos_tagged = st.tag(text.split())
    return pos_tagged
def processor(name, url, tokens, db_path, json_dir, USE_TITLE_WORDS=False):
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger',
                       'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)
    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ['NNP', 'NNPS', 'FW']:
            label = 1.5
        elif t in ['NN', 'NNS']:
            label = 1
        else:
            continue
        w = w.lower()
        try:
            unsorted_kw[w] += label
        except KeyError:
            unsorted_kw[w] = label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    words = (word.lower() for word in unsorted_kw)
    for word in words:
        try:
            if token_vecs[word]:
                continue
        except KeyError:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
    print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    conn.close()

    # Compute cluster centers:
    nk = round(len(token_vecs) / 4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data, nk, iter=20, minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir, name + '.json')
    with open(json_path, 'w') as file_dest:
        json.dump({'url': url,
                   'vectors': token_vecs,
                   'keyword_frequency': unsorted_kw,
                   'centroids': centroids}, file_dest)
class Tagger():
    def __init__(self):
        self.st = POSTagger(
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/models/english-bidirectional-distsim.tagger'),
            os.path.normpath(
                os.path.dirname(os.path.realpath(__file__)) +
                '/stanford-pos/stanford-postagger.jar'))

    def tag(self, line):
        return self.st.tag(line.split())
def pos_tag(to_tag,
            model_path=root_path + "\\stanford-postagger-full-2013-06-20\\models\\french.tagger",
            jar_path=root_path + "\\stanford-postagger-full-2013-06-20\\stanford-postagger.jar"):
    '''tag the tokens with part of speech; to_tag is the tokens to be tagged;
    model_path is the file path to the Stanford POS tagger model; and
    jar_path is the path to the Stanford POS tagger jar file'''
    # create a POSTagger object that uses UTF-8 encoding
    pos_tagger = POSTagger(model_path, jar_path, encoding='utf8')
    # run the tagging algorithm on the tokenized raw text
    tags = pos_tagger.tag(to_tag)
    return tags
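# Hypothetical usage of pos_tag() above, assuming root_path points at a
# directory containing the 2013-06-20 tagger release with the French model;
# the sentence and the printed tags are illustrative only.
from nltk import word_tokenize

tokens = word_tokenize("Le chat dort sur le canape .")
print(pos_tag(tokens))  # e.g. [('Le', 'DET'), ('chat', 'NC'), ...]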
def tag(segments):
    st = POSTagger(
        os.path.join(stanford_path, 'models/english-left3words-distsim.tagger'),
        os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar'))
    tagged = []
    for segment in segments:
        x = ' '.join(
            nltk.tag.tuple2str(w) for w in st.tag(word_tokenize(segment)))
        tagged.append(x.decode('utf-8'))
    return tagged
def main(): print "Inicio..." with open("tweets_a_procesar_v2.csv", 'rb') as csvfile: lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'") # En esta variable estan todos los tweets tweets = [] for line in lines: tweet = Tweet(line) #print tweet.spanish_text.split() tweets.append(tweet) #archivo de salida output = open("output_tagged_v2.csv", 'wb') filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'") #importando el tagger en español de Stanford NLP from nltk.tag.stanford import POSTagger st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8') #st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8') #st = POSTagger('C:\Data\stanford-postagger-full-2014-08-27\models\spanish.tagger', 'C:\Data\stanford-postagger-full-2014-08-27\stanford-postagger-3.4.1.jar', encoding='utf-8') n=0 for tweet in tweets: n+=1 print tweet.spanish_text #Ejemplo: st.tag('What is the airspeed of an unladen swallow ?'.split()) tweet_tagged = st.tag((tweet.spanish_text).split()) #Ejem_output: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] #print tweet_tagged important_words = [] n_adj = 0 for tag in tweet_tagged: inicial = tag[1][:1] if('a' in inicial): important_words.append(tag[0]) if('r' in inicial): important_words.append(tag[0]) if('n' in inicial): important_words.append(tag[0]) if('v' in inicial): important_words.append(tag[0]) #tweet.cant_adj = n_adj tweet.tweet_tagged = tweet_tagged tweet.important_words = important_words filewriter.writerow(tweet.to_CSV()) if n % 100 == 0: print n print "Done" output.close()
class yagoScores:
    def __init__(self):
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger',
                                      'parser/stanford-postagger.jar')

    def parse(self, text):
        return self.en_postagger.tag(text.split())

    def get_underscoreWords(self, text):
        return re.findall("[a-z]+_[a-z]+", text)

    def findNounsSeq(self, tuples):
        self.noun = []
        self.nouns = []
        prev = ""
        for each in tuples:
            if each[1] == "NN":
                self.noun.append(each[0])
            if each[1] == "NNS":
                self.nouns.append(prev + " " + each[0])
                prev = prev + " " + each[0]
            else:
                prev = each[0]

    def searchInWiki(self, guessess):
        text = " ".join(self.nouns)
        print text
        links = wikipedia.search(text)
        print "LINKS"
        print links
        for link in links:
            page = wikipedia.page(link)
            print page.title
            # check if a guess appears in that page
            for eachg in guessess:
                print eachg.replace("_", " ").lower()
                if eachg.replace("_", " ").lower() in page.content.lower():
                    print "found"
                    self.freq[eachg] += 1

    # Call getScore(text, guessess) from outside; returns a dict of scores
    # counting Wikipedia appearances.
    def getScore(self, text, guessess):
        self.freq = defaultdict(int)
        tuples = self.parse(text)
        print tuples
        self.findNounsSeq(tuples)
        self.searchInWiki(guessess)
        print self.freq
        return self.freq
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    st = POSTagger('english-bidirectional-distsim.tagger',
                   'stanford-postagger.jar')
    return st.tag(toked_sentence)
class StanfordTagger(WorkflowNativePOSTagger):

    def __init__(self, xml):
        from nltk.tag.stanford import POSTagger
        import os
        super(StanfordTagger, self).__init__(xml)
        self.tagger = POSTagger(
            os.path.join(os.getcwd(), 'External/english-bidirectional-distsim.tagger'),
            os.path.join(os.getcwd(), 'External/stanford-postagger.jar'))

    def is_ascii(self, s):
        return all(ord(c) < 128 for c in s)

    def tokenize(self, document):
        # Non-ASCII characters make the Stanford tagger misbehave and run
        # out of heap space
        if self.is_ascii(document):
            for word, tag in self.tagger.tag(document):
                yield "%s/%s" % (word, tag)
def pos_tag(sent, tagger='stanford'):
    # saves pos_tagger as a global variable, so that it is not recreated
    # every time pos_tag is executed
    if not 'pos_tagger' in globals():
        global pos_tagger
        pos_tagger = POSTagger(conf.stanford_pos_model,
                               path_to_jar=conf.stanford_postagger,
                               encoding='UTF-8')
    if tagger == 'nltk':
        tokens = tokenize(sent, 's')
        return nltk.pos_tag(tokens)
    elif tagger == 'stanford':
        tokens = tokenize(sent, 'w')
        return pos_tagger.tag(tokens)
    else:
        raise ValueError('No such tagger: ' + tagger)
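# Why the global above helps: NLTK's Stanford wrapper shells out to a Java
# process on each tag() call, so that cost cannot be avoided, but caching
# the wrapper object skips re-validating the model/jar paths and rebuilding
# the command line on every call, and gives one place to configure the
# tagger. Hypothetical calls, assuming conf holds valid paths:
print(pos_tag('The quick brown fox jumps .'))          # Stanford tagger (default)
print(pos_tag('The quick brown fox jumps .', 'nltk'))  # NLTK built-in tagger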
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/home/satyam/zip/opinionproject/opinion_mining/resources/english-bidirectional-distsim.tagger',
        '/home/satyam/zip/opinionproject/opinion_mining/resources/stanford-postagger.jar')
    return st.tag(toked_sentence)
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    st = POSTagger(
        '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger',
        '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar')
    return st.tag(toked_sentence)
def stan_pos(input_sent):
    """
    Calls the Stanford POS tagger. The Stanford POS tagger directory must
    sit in the same directory as this script. The "wsj left 3 words" model
    is used as the default POS tagging model; to use a different model,
    change the first argument of POSTagger() below.
    """
    eval_sent = []
    st = POSTagger("./stanford-postagger-2012-11-11/models/wsj-0-18-left3words.tagger",
                   "./stanford-postagger-2012-11-11/stanford-postagger.jar")
    pos_result = st.tag(input_sent.split())
    for one_tuple in pos_result:
        pos_format = one_tuple[0] + "_" + one_tuple[1]
        eval_sent.append(pos_format)
    eval_sent = reg_form(eval_sent)
    return eval_sent
def main():
    dict2 = readDict("dict2.txt")
    sentences2 = readSentences("sentences2.txt")
    translated2 = translate(sentences2, dict2)
    print "======================================BASE TRANSLATION=========================================="
    for sentence in translated2:
        print sentence
    print "================================================================================================"

    st = POSTagger('stanford-postagger/models/english-left3words-distsim.tagger',
                   'stanford-postagger/stanford-postagger.jar')
    POS = []
    for sentence in translated2:
        tagged = st.tag(sentence.split())
        if len(tagged) > 0:
            POS.append(tagged)

    POS = stupidFixes(POS)
    print "==================================STUPID FIXES TRANSLATION======================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = rulesOneThree(POS)
    print "=====================================RULE1+3 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = rulesFourFiveSeven(POS)
    print "=====================================RULE4+5+7 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = ruleTwoNine(POS)
    POS = ruleTwoNine(POS)  # apply twice
    print "=====================================RULE2+9 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))

    POS = ruleSixEight(POS)
    print "=====================================RULE6+8 TRANSLATION========================================"
    for sentence in POS:
        print ' '.join(map(getWord, sentence))
def get_transactions(self, product_reviews):
    '''
    Generates a set of transactions ready for frequent itemset mining
    from the crawled product reviews
    '''
    pos_tagger = POSTagger(PATHS['POS_MODEL'], PATHS['POS_TAGGER'])
    pos_output = []
    transactions_output = []
    print 'Generating transactions...'
    product_count = 0
    sentence_count = 0
    for product in product_reviews:
        sentences = sent_tokenize(product)
        for sentence in sentences:
            try:
                sent_pos = pos_tagger.tag(word_tokenize(sentence))
            except UnicodeEncodeError:
                continue
            trans = []
            pos_tags = []
            for word, pos in sent_pos:
                pos_tags.append(':'.join([word, pos]))
                if ((pos == 'NN' or pos == 'NNS' or pos == 'NP')
                        and re.match('^[A-Za-z0-9-]+$', word)):
                    trans.append(word.lower())
            if trans:
                pos_output.append([sentence] + pos_tags)
                transactions_output.append([sentence] + trans)
                sentence_count += 1
        product_count += 1
        print '---%s Reviews and %s Transactions Parsed---' % (
            product_count, sentence_count)
    write_csv(PATHS['POS'], pos_output)
    write_csv(PATHS['TRANSACTIONS'], transactions_output)
    print 'Finished generating transactions...'
def main(word_transformation=None, result_path=None, save=SAVE, n=500):
    tagger = POSTagger(
        '/cs/fs/home/hxiao/code/CoreNLP/classes/edu/stanford/nlp/models/pos-tagger/english-left3words/english-bidirectional-distsim.tagger',
        '/cs/fs/home/hxiao/code/CoreNLP/javanlp-core.jar')

    tagged_corpus = nltk.corpus.treebank.tagged_sents()[-n:]

    print "extracting sentence words"
    if word_transformation and callable(word_transformation):
        tagged_corpus = [[(word_transformation(w), t) for w, t in sent]
                         for sent in tagged_corpus]

    print "extracting sents/tags"
    sents = ([w for w, t in sent] for sent in tagged_corpus)
    correct_tags = [[t for w, t in sent] for sent in tagged_corpus]

    print "predicting"
    predicted_tags = []
    really_correct_tags = []  # some sentences might be dropped
    sentences = []
    for i, (ctags, sent) in enumerate(zip(correct_tags, sents)):
        if (i + 1) % 5 == 0:
            print "%d finished" % (i + 1)
        try:
            ptags = [t for w, t in tagger.tag(sent)]
            if len(ctags) == len(ptags):
                predicted_tags.append(ptags)
                really_correct_tags.append(ctags)
                sentences.append(sent)
            else:
                print "tags length does not match for %r" % (sent)
        except UnicodeDecodeError:
            print "UnicodeDecodeError for ", sent
        except Exception:
            traceback.print_exc()

    if save:
        print "dumping to '%s'" % (result_path)
        dump((really_correct_tags, predicted_tags, sentences),
             open(result_path, "w"))
def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Given a tokenized sentence, return a list of tuples of form
    (token, POS) where POS is the part of speech of token
    """
    from nltk.tag.stanford import POSTagger
    import os

    # the original referenced an undefined name `path` here; it clearly
    # meant the current working directory computed below
    basePath = os.getcwd()
    st = POSTagger(
        basePath + '/resources/stanford-postagger-2015-12-09/models/english-bidirectional-distsim.tagger',
        basePath + '/resources/stanford-postagger-2015-12-09/stanford-postagger.jar')
    return st.tag(toked_sentence)
def stanfordTag(modelPath, stanfordJarPath, text, encoding):
    # if JAVA_HOME does not already point at java.exe, configure it
    if not bool(re.search("java.exe", os.getenv("JAVA_HOME"))):
        java_path = os.getenv("JAVA_HOME") + "/bin/java.exe"
        os.environ['JAVA_HOME'] = java_path
        print(java_path)
        nltk.internals.config_java(java_path)

    entities = []
    stemmer = SnowballStemmer("french")
    st = POSTagger(modelPath, stanfordJarPath, encoding)
    print(text.split())
    tags = st.tag(text.split())
    print(tags)
    for tag in tags[0]:
        entity = {
            'token': tag[0],
            'pos': tag[1],
            'stemm': stemmer.stem(tag[0])
        }
        entities.append(entity)
    return entities
def tag_tokens(tokens):
    tagged_sents = []
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')
    print('Starting to tag sentences')

    # Progress bar setup
    toolbar_width = 40
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    # return to start of line, after '['
    sys.stdout.write("\b" * (toolbar_width + 1))

    no_of_sents = len(tokens)
    no_of_ticks = 0
    sent_counter = 0
    for line in tokens:
        # Returns a list of a list of tuples
        tagged_sents.append(st.tag(line))
        # Updating the bar
        sent_counter += 1
        trigger = (sent_counter * toolbar_width - 1) / no_of_sents
        if trigger >= no_of_ticks:
            while no_of_ticks < math.floor(trigger):
                sys.stdout.write("-")
                sys.stdout.flush()
                no_of_ticks += 1

    sys.stdout.write(">]\n")
    print('Done tagging')
    return tagged_sents
def get_whole(self, sentence):
    opinion_dict = dict()
    pos_f = open('../opinion-lexicon-English/positive-words.txt', 'rb')
    neg_f = open('../opinion-lexicon-English/negative-words.txt', 'rb')
    # skip the 35 header lines of each lexicon file
    for _ in xrange(35):
        pos_f.readline()
        neg_f.readline()
    for word in pos_f:
        opinion_dict[word.strip()] = True
    for word in neg_f:
        opinion_dict[word.strip()] = False
    pos_f.close()
    neg_f.close()

    stemmer = PorterStemmer()
    stanford_parser = parser.Parser()
    stanford_tagger = POSTagger(
        '../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger',
        '../stanford-postagger-full-2015-01-30/stanford-postagger.jar')

    w = open('sentence_test', 'wb')
    text_token = self.tf.stanford_tokenize(sentence)
    text_pos = stanford_tagger.tag(text_token)
    print text_pos
    text_dependency = stanford_parser.parseToStanfordDependencies(sentence)
    # mark the two ends of each adjectival-modifier (amod) dependency
    temp_list = ['none'] * len(text_token)
    for dep in text_dependency:
        if dep[0] == 'amod':
            temp_list[int(dep[1])] = '%s_1' % dep[0]
            temp_list[int(dep[2])] = '%s_2' % dep[0]

    for num, item in enumerate(text_pos[0]):
        temp_str = 'order'
        if opinion_dict.has_key(item[0]):
            temp_str = 'opion'
        feature_list = [item[0], item[1], stemmer.stem(item[0]),
                        item[0].lower(), temp_str, temp_list[num], 'O']
        w.write(' '.join(feature_list) + '\n')
def extract_examples(self):
    training_tuples = set()
    db_fh = open(self.database_loc, 'rb')
    for line in db_fh:  # go through PPDB
        elements = line.strip().split(' ||| ')
        # only look at 2-to-1 or 1-to-2 paraphrases
        if len(elements[1].split()) == 2 or len(elements[2].split()) == 2:
            many_phrase = elements[1] if len(elements[1].split()) == 2 else elements[2]
            one_phrase = elements[1] if len(elements[1].split()) == 1 else elements[2]
            if self.filter_number:  # filter out numbers, these are useless
                isNumber = False
                for token in many_phrase.split():
                    if self.pos_provided:
                        token = token.split('#')[0]
                    if self.is_number(token):
                        isNumber = True
                if not isNumber:
                    training_tuples.add((one_phrase, many_phrase))
            else:
                training_tuples.add((one_phrase, many_phrase))

    tagger = POSTagger(self.TAGGER_MODEL, self.TAGGER_LOC)
    self.training_examples = {}  # reset training examples
    for element in training_tuples:  # now, tag the resulting data
        words = element[1].split()
        words_only = ""
        if self.pos_provided:
            # POS tags provided externally can just be merged here;
            # otherwise call the tagger
            words_only = ' '.join([word_pos.split('#')[0] for word_pos in words])
        pos_tags = ([word_pos.split('#')[1] for word_pos in words]
                    if self.pos_provided
                    else [word_pos[1] for word_pos in tagger.tag(words)])
        collapsed_pos = []
        for pos in pos_tags:  # cluster certain POS tags together
            new_pos = collapsePOS(pos)
            collapsed_pos.append(new_pos)
        key = ' '.join(collapsed_pos)
        examples = self.training_examples[key] if key in self.training_examples else []
        if self.pos_provided:
            examples.append(' '.join([element[0], words_only]))
        else:
            examples.append(' '.join([element[0], element[1]]))
        self.training_examples[key] = examples
    sys.stderr.write("PPDB training data tagged and sorted\n")
    db_fh.close()
import os
from nltk import *
from nltk.tag.stanford import POSTagger
from nltk.stem.wordnet import WordNetLemmatizer

PATH_TO_TAGGER = os.path.join(os.getcwd(), "lib\\wsj-0-18-bidirectional-nodistsim.tagger")
PATH_TO_JAR = os.path.join(os.getcwd(), "lib\\stanford-postagger.jar")

print pos_tag("the sea touches me".split())

stanford_tagger = POSTagger(PATH_TO_TAGGER, PATH_TO_JAR)
print stanford_tagger.tag(word_tokenize("which ocean touches the state of California ?"))
class PersianPipeline:

    def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
        try:
            self.logger = logging.getLogger(__name__)
            self.posTagger = POSTagger(posTagModelPath, posTaggerPath,
                                       encoding="UTF-8",
                                       java_options='-Xmx16000m')
        except:
            self.logger.warning("Error in loading POS tagger!")
            e = sys.exc_info()[0]
            self.logger.warning("Error:" + str(e))
        try:
            self.parser = MaltParser(tagger=None, mco=parserModelPath,
                                     working_dir=workingDir,
                                     additional_java_args=['-Xmx16000m'])
        except:
            self.logger.warning("Error in loading the MALT Parser")
            e = sys.exc_info()[0]
            self.logger.warning("Error:" + str(e))

    # tokenizes, fixes some of the detached affixes
    def preprocess(self, s):
        # remove the diacritics
        drs = s
        for c in range(1611, 1619):
            drs = drs.replace(unichr(c), "")
        # normalize the Arabic yaa
        drs = drs.replace(unichr(1610), unichr(1740))
        drs = drs.replace(unichr(1609), unichr(1740))
        # tokenize the sentence
        ts = self.seperatePuncs(drs)
        # fix the affixes
        afs = self.fixAffixes(ts)
        # replace slashes and pounds and underlines
        afs = afs.replace("#", "-")
        afs = afs.replace("/", "-")
        afs = afs.replace("_", "-")
        return afs

    def preprocess4Annotation(self, s):
        ps = self.preprocess(s)
        ts = self.posTagASentence(ps)
        if ts:
            print "pos tagged"
        else:
            print "tagging failed"
        attS = self.attachPerCompounds(ts)
        # take the first element of the POS tuples and join them to form the
        # sentence; replace the ^ sign (from compound attaching) with a space
        finalS = " ".join(map(lambda x: x[0].replace("^", " "), attS))
        return finalS

    # tokenize a Persian sentence
    def seperatePuncs(self, s):
        s = re.sub(ur"([\[{\(\\`\"‚„†‡‹‘’“”•\.–—›««])", r"\1 ", s)
        s = re.sub(ur"([\]}\'\`\"\),;:!\?\%‚„…†‡‰‹‘’“”•–—›»\.])", r" \1", s)
        # Persian specific
        s = re.sub(ur"([،؛؟،\.])", r" \1 ", s)
        s = s.replace("  ", " ")
        return s

    def fixAffixes(self, sent):
        suffList = [u"ها", u"های"]
        sSent = sent.split(" ")
        newTokSent = []
        sentLen = len(sSent)
        i = 0
        try:
            while i < sentLen:
                if sSent[i] in suffList and newTokSent:
                    # attach the suffix to the previous word
                    newTokSent[-1] = newTokSent[-1] + u"\u200c" + sSent[i]
                else:
                    newTokSent.append(sSent[i])
                i += 1
            return " ".join(newTokSent)
        except:
            return sent

    def posTagASentence(self, sent):
        try:
            sent = sent.replace("/", "-")
            posSent = self.posTagger.tag(sent.split())
            return posSent
        except:
            self.logger.warning("problem in pos!" + sent)
            return None
    # Reads in a POS-tagged sentence (list); if there are two adjacent
    # verbs, attaches them together to make one word.
    def attachPerCompounds(self, posSent):
        prFlag = False
        ct = senCt = prCt = 0
        i = 0
        senCt += 1
        pos = wd = outWd = ""
        sentLen = len(posSent)
        newPOSSent = []
        while i < sentLen - 1:
            ct += 1
            tok = posSent[i]
            nexTok = posSent[i + 1]
            (wd, pos) = tok
            (nwd, npos) = nexTok
            outWd = wd
            if pos == "V" and npos == "V":
                prFlag = True
                outWd = wd + '^' + nwd
                pos = "V"
                i += 1
            # attaching the "mi" prefix for the present continuous form
            if npos == "V" and wd.strip() == u"می":
                prFlag = True
                outWd = u"می" + u"\u200c" + nwd
                pos = "V"
                i += 1
            newPOSSent.append((outWd, pos))
            i += 1
        # don't forget the last word (if not processed)
        if i < sentLen:
            ct += 1
            tok = posSent[-1]
            newPOSSent.append(tok)
        # count the lines with compound-verb patterns
        if prFlag:
            prCt += 1
        return newPOSSent

    def parseATaggedSentence(self, tSent):
        try:
            compTSent = self.attachPerCompounds(tSent)
            depParse = self.parser.tagged_parse(compTSent)
            if depParse:
                pl = depParse.to_conll(10).replace("^", " ")
                return pl
            else:
                return None
        except Exception, e:
            print "Error in parsing a sentence!" + str(e)
            return None
def evaluate(granularity, text):
    preprocessor = Preprocessor()
    entry = TextEntry()
    entry.body = text
    preprocessor.entries = [entry]
    data = preprocessor.get_clean_data()
    ncharsAll = preprocessor.getNChars(items=data, freq=20)

    test_data_raw = preprocessor.get_clean_data()
    test_raw_text = preprocessor.get_raw_words()

    count_vect = joblib.load('../models/t1/vec_count.joblib')
    tfidf_transform = joblib.load('../models/t1/tfidf_transform.joblib')
    data_counts = count_vect.transform(test_data_raw)
    test_data = tfidf_transform.transform(data_counts)
    dense_test = test_data.toarray()

    # keep only the n-chars that are not already in the vectorizer vocabulary
    vocab = count_vect.vocabulary_
    nchars = []
    for nchar in ncharsAll:
        if nchar not in vocab:
            nchars.append(nchar)

    numOfTags = len(tags)
    ncharVecSize = len(nchars)
    tag_vecs = []
    pos = POSTagger(model, jar, java_options='-mx2500m')
    for i, text in enumerate(test_raw_text):
        if i % 10 == 0:
            print(i)
        words = text.split()
        tag_vector = np.zeros(numOfTags)
        words_with_tags = pos.tag(words)
        only_tags = [tag for word, tag in words_with_tags[0]]
        tags_with_freq = Counter(only_tags)
        for tag, freq in tags_with_freq.items():
            tag_vector[tags.index(tag)] = freq / len(words)
        tag_vecs.append(tag_vector)

    for i, text in enumerate(test_raw_text):
        if i % 100 == 0:
            print(i)
        words = text.split()
        ncharVec = np.zeros(ncharVecSize)
        for word in words:
            for size in sizes:
                text_nchars = [word[i:i + size]
                               for i in range(len(word) - size + 1)]
                text_nchars_with_freq = Counter(text_nchars)
                for nchar, freq in text_nchars_with_freq.items():
                    if nchar in nchars:
                        ncharVec[nchars.index(nchar)] = freq / len(words)
        test_data[i] = np.concatenate((dense_test[i], ncharVec, tag_vecs[i]))

    svm_l = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    # note: this loads the same file as svm_l above
    svm_u = joblib.load('../models/t1/svm_l_' + granularity + '/svm_l_' +
                        granularity + '.joblib')
    evaluator = ClfEval(svm_l, svm_u)
    return evaluator.eval_data(csr_matrix(test_data))
class TextParser:
    taggedText = Counter()
    # Penn Treebank tags
    tagList = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
               'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'RB',
               'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
               'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
    tagCriteria = ('DT', 'EX', 'JJ', 'MD', 'NN', 'POS', 'PRP', 'RB', 'VB',
                   'VBD', 'VBG', '#', '$', "'", ',')
    stanfordTagger = None

    def __init__(self, pathToParser=None,
                 javaHeapOptions='-Xmx4g -XX:+UseParallelGC -XX:-UseGCOverheadLimit'):
        if pathToParser is None:
            taggerLibraryPath = normpath(os.path.join(os.getcwd(), "sp/jar/stanford-postagger.jar"))
            taggerModelPath = normpath(os.path.join(os.getcwd(), "sp/models/english-bidirectional-distsim.tagger"))
        else:
            taggerLibraryPath = normpath(os.path.join(pathToParser, "sp/jar/stanford-postagger.jar"))
            taggerModelPath = normpath(os.path.join(pathToParser, "sp/models/english-bidirectional-distsim.tagger"))

        self.stanfordTagger = POSTagger(taggerModelPath, taggerLibraryPath,
                                        java_options=javaHeapOptions)

    def tagTextFile(self, documentName, textFilePath, useCriteria=False):
        tempTaggedText, finalList = [], []
        textFile = readFromFile(textFilePath)
        for line in textFile.splitlines():
            tempTaggedText.extend(self.stanfordTagger.tag(line.split()))
        if useCriteria:
            for x, y in tempTaggedText:
                if y in self.tagCriteria:
                    finalList.append((x, y))
        else:
            for x, y in tempTaggedText:
                finalList.append((x, y))
        self.taggedText[documentName] = finalList

    def getTagCountVector(self, textString):
        splitString = textString.split()
        numberOfWords = len(splitString)
        tempTaggedText = self.stanfordTagger.tag(splitString)
        # count the tags that appear in the Penn Treebank tag list
        counterVector = Counter([y for x, y in tempTaggedText if y in self.tagList])
        resultantVector = OrderedDict()
        for k in self.tagList:
            if k in counterVector:
                resultantVector[k] = float(counterVector[k]) / numberOfWords
            else:
                resultantVector[k] = 0
        return resultantVector

    def tagText(self, documentName, textString, useCriteria=False):
        tempTaggedText, finalList = [], []
        for line in textString.splitlines():
            tempTaggedText.extend(self.stanfordTagger.tag(line.split()))
        if useCriteria:
            for x, y in tempTaggedText:
                if y in self.tagCriteria:
                    finalList.append((x, y))
        else:
            for x, y in tempTaggedText:
                finalList.append((x, y))
        self.taggedText[documentName] = finalList

    def getEmailFromString(self, emailString):
        message = Parser().parsestr(emailString)
        return (message, message.is_multipart())

    def ngram(self, textString, n=3):
        # defaults to trigrams
        return ngrams(textString.split(), n)
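# A short usage sketch for TextParser.getTagCountVector() above; the sp/
# directory layout is assumed to exist as in __init__, and the printed
# values are illustrative only.
tp = TextParser()
vec = tp.getTagCountVector("The quick brown fox jumps over the lazy dog")
print([(tag, freq) for tag, freq in vec.items() if freq > 0])
# e.g. [('DT', 0.2222), ('IN', 0.1111), ('JJ', 0.2222), ('NN', 0.3333), ('VBZ', 0.1111)]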
length = 0
i = 0
for fname in os.listdir('test_data'):
    if fname.endswith('.edus'):
        print i
        print fname
        i = i + 1
        f = open(os.path.join('test_data', fname), 'r')
        mys1 = os.path.join('test_data', fname.split(".")[0] + ".pos")
        print mys1
        pos = open(mys1, "w")
        data = f.read().splitlines()
        for line in data:
            if len(line) > length:
                length = len(line)
            wordb = word_tokenize(line)
            tags = english_postagger.tag(wordb)
            # write the raw line, a separator, then the tag sequence
            pos.write(str(line.strip()))
            pos.write("@#%^&*")
            for tgpair in tags:
                pos.write(str(tgpair[1]))
                pos.write("\t")
            pos.write("\n")
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import nltk
from nltk.tag.stanford import POSTagger
import sys

if len(sys.argv) != 2:
    print 'must have one argument'
    sys.exit()

chunk = sys.argv[1].decode('utf-8')

text = nltk.word_tokenize(chunk.encode('utf-8'))
st = POSTagger('chinese-distsim.tagger', 'stanford-postagger-3.1.4.jar')
tsentence = st.tag(text)
for w in tsentence:
    # the Chinese model emits word#tag tokens; keep only the tag
    print w[1].split('#')[1]
# import nltk with Stanford's Spanish tagger
from nltk.tag.stanford import POSTagger
spanish_postagger = POSTagger('c:/stanford-postagger/models/spanish.tagger',
                              'c:/stanford-postagger/stanford-postagger.jar')

# read the article texts from the database
con = psycopg2.connect(CHATBOT_CONNECTION_STRING)
cursor = con.cursor()
cursor.execute("SELECT texto FROM reglamentacion.articulos")
textos = cursor.fetchall()

# walk through the texts, POS-tag each one, and store only the words that
# are nouns, adjectives or verbs
for texto in textos:
    listaPalabras = spanish_postagger.tag(texto[0].split())
    for palabras in listaPalabras:
        for pal in palabras:
            if pal[1][0:1] == 'v':
                cursor.execute(
                    "INSERT INTO reglamentacion.palabras(id, palabra, tipo) VALUES (DEFAULT, '" + pal[0] + "', 'V')")
                con.commit()
            if pal[1][0:1] == 'n':
                cursor.execute(
                    "INSERT INTO reglamentacion.palabras(id, palabra, tipo) VALUES (DEFAULT, '" + pal[0] + "', 'S')")
                con.commit()
            if pal[1][0:1] == 'a':
                # truncated in the original; completed by analogy with the
                # verb and noun branches above, assuming 'A' marks adjectives
                cursor.execute(
                    "INSERT INTO reglamentacion.palabras(id, palabra, tipo) VALUES (DEFAULT, '" + pal[0] + "', 'A')")
                con.commit()
class yagoScores:
    def __init__(self):
        self.cnx = pymysql.connect(user='******', database='yago',
                                   password='******')
        self.cursor = self.cnx.cursor()
        self.query = "select * from yagoFacts where t1='%s' or t3='%s'"
        self.en_postagger = POSTagger('parser/models/english-bidirectional-distsim.tagger',
                                      'parser/stanford-postagger.jar')
        self.stopwords = nltk.corpus.stopwords.words('english')

    def parse(self, text):
        return self.en_postagger.tag(text.split())

    def get_underscoreWords(self, text):
        return re.findall("[a-z]+_[a-z]+", text)

    def findNounsSeq(self, tuples):
        self.noun = []
        self.nouns = []
        prev = ""
        for each in tuples:
            if each[1] == "NN":
                self.noun.append(each[0])
            if each[1] == "NNS":
                self.nouns.append(prev + " " + each[0])
                prev = prev + " " + each[0]
            else:
                prev = each[0]

    def changeToYagoFormat(self, g):
        # capitalize the first letter and every letter after an underscore,
        # then wrap in angle brackets
        g = g.strip()
        g = g.replace("'", "")
        char = [c for c in g]
        char[0] = char[0].upper()
        prev = False
        for i in range(0, len(g)):
            if prev:
                char[i] = char[i].upper()
                prev = False
            if char[i] == "_":
                prev = True
        return "<" + "".join(char) + ">"

    def getFacts(self, g):
        facts = []
        self.cursor.execute(self.query % (g, g))
        for each in self.cursor:
            facts.append([each[1], each[2], each[3]])
        return facts

    def generateFeatures(self, tuples, facts):
        t2 = 0
        t1_t3 = 0
        for f in facts:
            f = (str(f[0].decode('ascii', 'ignore')) +
                 str(f[1].decode('ascii', 'ignore')) +
                 str(f[2].decode('ascii', 'ignore')))
            f = f.lower()
            f = f.split(">")
            f = f[:3]
            for i in range(0, 3):
                f[i] = f[i].replace("_", " ")
                f[i] = f[i].replace("<", "")
                f[i] = f[i].replace(")", "")
                f[i] = f[i].replace("(", "")
                f[i] = f[i].replace("-", " ")
            for each in tuples.split():
                if each not in self.stopwords:
                    each = str(each).lower()
                    each = each.replace("_", " ")
                    if len(each) > 2 and each in f[1]:
                        print (each + "---" + f[1])
                        t2 += 1
                    if len(each) > 2 and (each in f[0].split() or each in f[2].split()):
                        print (each + "------------" + f[0] + "----" + f[2])
                        t1_t3 += 1
        # TODO: for now this returns only the total similarity counts
        return [t2, t1_t3]

    def searchInYago(self, text, guess):
        eachGuess = self.changeToYagoFormat(guess)
        facts = self.getFacts(eachGuess)
        count = self.generateFeatures(text, facts)
        return count

    # Call getScore(text, guessess) from outside; returns the counts of
    # YAGO appearances for the guess.
    def getScore(self, text, guessess):
        self.freq = defaultdict(int)
        return self.searchInYago(text, guessess.strip())
>>> from nltk import word_tokenize
>>> s = "I was watching TV"
>>> print nltk.pos_tag(word_tokenize(s))

# all nouns
>>> tagged = nltk.pos_tag(word_tokenize(s))
>>> allnoun = [word for word, pos in tagged if pos in ['NN', 'NNP']]

# Stanford POS tagger
>>> from nltk.tag.stanford import POSTagger
>>> import nltk
>>> stan_tagger = POSTagger('models/english-bidirectional-distsim.tagger',
...                         'stanford-postagger.jar')
>>> tokens = nltk.word_tokenize(s)
>>> stan_tagger.tag(tokens)

# POS tag frequency distribution
>>> from nltk.corpus import brown
>>> import nltk
>>> tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
>>> print nltk.FreqDist(tags)

# default tagger
>>> brown_tagged_sents = brown.tagged_sents(categories='news')
>>> default_tagger = nltk.DefaultTagger('NN')
>>> print default_tagger.evaluate(brown_tagged_sents)

# N-gram taggers
>>> from nltk.tag import UnigramTagger
from nltk.tag.stanford import POSTagger

postagger = POSTagger(
    "./stanford-postagger-full-2014-10-26/models/english-bidirectional-distsim.tagger",
    "./stanford-postagger-full-2014-10-26/stanford-postagger.jar")
print postagger.tag('What is the airspeed of an unladen swallow ?'.split())
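# Note: NLTK 3.x renamed this class, so on current NLTK the import above
# fails. A sketch of the equivalent call with the newer API, assuming the
# same model and jar paths; the constructor arguments are unchanged
# (model path first, then jar path).
from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger(
    "./stanford-postagger-full-2014-10-26/models/english-bidirectional-distsim.tagger",
    "./stanford-postagger-full-2014-10-26/stanford-postagger.jar")
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))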
import nltk
from nltk.tag.stanford import POSTagger

st = POSTagger('stanford-postagger-2014-01-04/models/english-bidirectional-distsim.tagger',
               'stanford-postagger-2014-01-04/stanford-postagger.jar')
sentence = 'Himanshu Bindal is a genius?'
taggedSentence = st.tag(nltk.word_tokenize(sentence))
print taggedSentence
import urllib
import nltk
from bs4 import BeautifulSoup
from html2text import html2text
import re

#response = urllib.request.urlopen('http://python.org')
#html = response.read()
# data cleaning
#soup = BeautifulSoup(html, 'html.parser')
#clean = soup.get_text()
#print(clean)

# part-of-speech tagging
from nltk import word_tokenize
s = " I was a teacher, I am watching TV"
print(nltk.pos_tag(word_tokenize(s)))

from nltk.tag.stanford import POSTagger
stan_tagger = POSTagger('/Users/wang/dev/stanford-postagger/models/english-bidirectional-distsim.tagger',
                        '/Users/wang/dev/stanford-postagger/stanford-postagger-3.9.1.jar')
tokens = word_tokenize(s)
print('Stanford Tagger')
print(stan_tagger.tag(tokens))

# named entity recognition (NER)
from nltk import ne_chunk
Sent = " Here is Stallman, he was working at HSBC Co. LTD before "
print(ne_chunk(nltk.pos_tag(word_tokenize(Sent)), binary=False))
def pos_stanford(tokens):
    tagger = POSTagger('./english-bidirectional-distsim.tagger',
                       './stanford-postagger.jar')
    return tagger.tag(tokens)
]

tokens = [t for t in tokens if t not in non_tech_words and t.isalpha()]
print tokens

# Part of Speech Tagging, in chunks of roughly 5000 tokens
tags = []
starti = 0
endi = 0
no_chunks = len(tokens) / 5000
print 'Process ' + str(len(tokens)) + ' tokens in ' + str(no_chunks) + ' chunks..'
for l in range(0, no_chunks):
    endi = min((starti + (len(tokens) / no_chunks)), len(tokens))
    print "Tagging #" + str(l) + ": from " + str(starti) + " to " + str(endi - 1)
    tags = tags + pos.tag(tokens[starti:endi])[0]
    starti = endi

print "\n" + str(len(tags)) + " words tagged.."

# Save all the Noun and Adjective unigrams in a hash table
Tag_set = {'Word': 'Tag'}
for tag in tags:
    if cleanseNN([str(tag[1])]) in patterns[0:2]:
        Tag_set[str(tag[0])] = str(tag[1])
print Tag_set

# Look for the longest n-gram appearing in each sentence with the patterns
# of technical terms
from nltk import *
from nltk.tag.stanford import POSTagger

PATH_TO_TAGGER = r'C:\\AptanaWorkspace\\Thesis\\src\\lib\\english-left3words-distsim.tagger'
PATH_TO_JAR = r'C:\\AptanaWorkspace\\Thesis\\src\\lib\\stanford-postagger.jar'

st = POSTagger(PATH_TO_TAGGER, PATH_TO_JAR)
s1 = "Where is the nearest city to Columbus?"
s2 = "Where is the nearest city to Ohio State University?"
tagged_question = st.tag(word_tokenize(s2))
s = corpus.treebank.tagged_sents()[22]
print s
print tagged_question
print ne_chunk(tagged_question)
class PersianPipeline:

    def __init__(self, posTagModelPath, posTaggerPath, parserModelPath, workingDir):
        try:
            self.posTagger = POSTagger(posTagModelPath, posTaggerPath, "UTF-8")
            print "pos tagger is loaded"
        except:
            print "Error in loading POS tagger"

        try:
            self.parser = MaltParser(tagger=None, mco=parserModelPath,
                                     working_dir=workingDir)
            print "parser is loaded"
        except:
            print "Error in loading the MALT Parser"

    # tokenizes, fixes some of the detached affixes
    def preprocess(self, s):
        # remove the diacritics
        drs = s
        for c in range(1611, 1619):
            drs = drs.replace(unichr(c), "")
        # tokenize the sentence
        ts = self.seperatePuncs(drs)
        # fix the affixes
        afs = self.fixAffixes(ts)
        # replace slashes and pounds and underlines
        afs = afs.replace("#", "-")
        afs = afs.replace("/", "-")
        afs = afs.replace("_", "-")
        return afs

    # tokenize a persian sentence
    def seperatePuncs(self, s):
        s = re.sub(ur"([\[{\(\\`\"‚„†‡‹‘’“”•\.–—›««])", r"\1 ", s)
        s = re.sub(ur"([\]}\'\`\"\),;:!\?\%‚„…†‡‰‹‘’“”•–—›»\.])", r" \1", s)
        # persian specific
        s = re.sub(ur"([،؛؟،\.])", r" \1 ", s)
        s = s.replace("  ", " ")
        return s

    def fixAffixes(self, sent):
        suffList = [u"ها", u"های"]
        sSent = sent.split(" ")
        newTokSent = []
        sentLen = len(sSent)
        i = 0
        while i < sentLen:
            if sSent[i] in suffList:
                print "+++ Affix problem got fixed"
                # attach the suffix to the previous word
                newTokSent[-1] = newTokSent[-1] + u"\u200c" + sSent[i]
            else:
                newTokSent.append(sSent[i])
            i += 1
        return " ".join(newTokSent)

    def posTagASentence(self, sent):
        try:
            sent = sent.replace("/", "-")
            posSent = self.posTagger.tag(sent.split())
            return posSent
        except:
            return None

    # Reads in a POS-tagged sentence (list); if there are two adjacent
    # verbs, attaches them together to make one word.
    def attachPerCompounds(self, posSent):
        prFlag = False
        ct = senCt = prCt = 0
        i = 0
        senCt += 1
        pos = wd = outWd = ""
        sentLen = len(posSent)
        newPOSSent = []
        while i < sentLen - 1:
            ct += 1
            tok = posSent[i]
            nexTok = posSent[i + 1]
            (wd, pos) = tok
            (nwd, npos) = nexTok
            outWd = wd
            if pos == "V" and npos == "V":
                prFlag = True
                outWd = wd + '^' + nwd
                pos = "V"
                i += 1
            # attaching the "mi" prefix for the present continuous form
            if npos == "V" and wd.strip() == u"می":
                prFlag = True
                outWd = u"می" + u"\u200c" + nwd
                pos = "V"
                i += 1
            newPOSSent.append((outWd, pos))
            i += 1
        # don't forget the last word (if not processed)
        if i < sentLen:
            ct += 1
            tok = posSent[-1]
            newPOSSent.append(tok)
        # count the lines with compound-verb patterns
        if prFlag:
            prCt += 1
        return newPOSSent

    def parseATaggedSentence(self, tSent):
        try:
            compTSent = self.attachPerCompounds(tSent)
            depParse = self.parser.tagged_parse(compTSent)
            return depParse
        except:
            print "Error in parsing a sentence!"
            return None

    def parseASentence(self, sent):
        pass
parsed = open('combine_parsed', 'r').read()
reviews = parsed.split('> (')
processed_reviews = []
pos_sentence = {}

for review in reviews:
    if review != '':
        review = review.strip()[:-1]
        review = review.split('\n')[:-1]
        processed_items = []
        for item in review:
            item = item.split('\t')
            item[1] = item[1][1:]
            item[-1] = item[-1][:-1]
            item[2] = item[2].split(' ')
            processed_items.append(item)
        processed_reviews.append(processed_items)

for ind, review in enumerate(processed_reviews):
    tokens = []
    for item in review:
        if item[0][0] != 'E':
            tokens.append(item[1])
    pos_sentence[ind] = english_postagger.tag(tokens)

cPickle.dump(pos_sentence, open('pos_combine', 'wb'))
conn = connect("dbname=Ohio user=postgres password=ohiostate") cur = conn.cursor() # tag new sentences and update the database question_type_id=None with open("new_question",'r') as fr: while(1): line=fr.readline() line=line.strip() if line=='': # EOF break if line[0]=='#': question_type_id=int(line.split(' ')[1]) print question_type_id,line count=0 else: tagged_question=tagger.tag(word_tokenize(line)) tag=' '.join([t for w,t in tagged_question]) tagged=' '.join([w+'/'+t for w,t in tagged_question]) try: sql = """INSERT INTO template (tag, question_type_id) values ('%s',%d)""" % \ (tag,question_type_id) print sql cur.execute(sql) conn.commit() except Exception,e: print str(e) conn.rollback() try: sql = """INSERT INTO question (sentence,tagged,tag,question_type_id) values ('%s','%s','%s',%d)""" % \ (line.replace("'", "''"),tagged.replace("'", "''"),tag,question_type_id) print sql
from nltk.tag.stanford import POSTagger
from sidd.paraphraser.Paraphraser import *
import nltk
import os

os.environ['JAVA_HOME'] = 'C:/Program Files/Java/jdk1.7.0_17/bin'  # lab desktop

from sidd.paraphraser import PPDBLoader

english_postagger = POSTagger('../../jars/english-left3words-distsim.tagger',
                              '../../jars/stanford-postagger.jar',
                              encoding='utf-8')

syntacticMap = PPDBLoader.createSyntacticParaphraseMap('s', '../../ppdb')

sentence = 'The box is thrown.'
orig_Sentence = sentence
sentence = english_postagger.tag(nltk.word_tokenize(sentence))
print sentence[0][0]
print sentence

modSentence = sentenceTuple(sentence)
ppCandidateList = generateNGramCandidatesToChange(modSentence, MAX_NGRAMS=4)