def convert_eng_to_isl(input_string):
    """Convert an English sentence into ISL (Indian Sign Language) word order.

    Single-word inputs are returned as a one-element list without parsing.
    Otherwise the sentence is parsed with the Stanford parser, the most
    probable parse tree is restructured via modify_tree_structure(), and the
    reordered leaf words are returned.
    """
    # get all required packages
    download_required_packages()
    words = input_string.split(' ')
    # BUG FIX: original tested `len(...) is 1` — identity comparison with an
    # int literal is implementation-dependent; use equality instead.
    if len(words) == 1:
        return words
    # Initializing stanford parser
    parser = StanfordParser()
    # Generates all possible parse trees sorted by probability for the sentence
    possible_parse_tree_list = [tree for tree in parser.parse(input_string.split())]
    # Get most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    print(parse_tree)
    # Convert into a mutable tree data structure and reorder it into ISL form
    parent_tree = ParentedTree.convert(parse_tree)
    modified_parse_tree = modify_tree_structure(parent_tree)
    parsed_sent = modified_parse_tree.leaves()
    return parsed_sent
def POS_data(self):
    """Parse ('POS') sentences from one normalized piece file and save the trees.

    Reads data/normalize_pos_piece/nor_pos_19.csv, constituency-parses every
    sentence with the Stanford parser, and writes each sentence together with
    its parse tree (tab separated) to data/pos_sent/pos_sent_19.csv.
    """
    tag = 'pos'
    idx = 19
    file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
    with open(file_name, 'r') as fin:
        sentences = fin.read().strip().split('\n')
    # FIX: removed unused local `stop_words` (never referenced in this method).
    eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'  # give the parser JVM 3 GB of heap
    print('=' * 100)
    print('current tag: {}, file idx: {}'.format(tag, idx))
    '''POS'''
    print('=' * 100)
    print('Starting POS...')
    pos_sent = []
    for sent in tqdm(sentences):
        # parse() expects a token list; keep only the most probable tree
        pos_sent.append(list(eng_parser.parse([w for w in sent.split()]))[0])
    '''save file'''
    save_file = 'data/{}_sent/{}_sent_{}.csv'.format(tag, tag, idx)
    with open(save_file, mode='w') as fout:
        for sent, pos in zip(sentences, pos_sent):
            fout.write(sent + '\t')
            fout.write(str(pos) + '\t')
    print('Finish! Saved in {}'.format(save_file))
def clean_apriori_data(self, sentences):
    """
    filter apriori data
    methods:
    - clean stop words
    - stemming
    - fuzzy matching within sentence
    """
    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    # debug aid: echo input sentences on small test runs
    if config.apriori_test_size < 6:
        for sent in sentences:
            print(sent)
    '''POS'''
    # constituency-parse each sentence; keep only the most probable tree
    pos_sent = []
    for sent in sentences:
        pos_sent.append(list(eng_parser.parse(
            [w for w in sent.split()]))[0])
    '''filter noun phrase & NLTK stemming'''
    cleaned_sent = []
    for sent in pos_sent:
        wnl = WordNetLemmatizer()
        tmp_sent = []
        # shallow noun phrases only (tree height <= 4)
        for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
            '''clean stop words & stemming'''
            tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
            '''lenght <= 3 & filter repeated list'''
            if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                tmp_sent.append(tmp)
        cleaned_sent.append(tmp_sent)
    # NOTE(review): cleaned_sent is computed but discarded — the function
    # returns the raw parse trees. Confirm whether cleaned_sent was meant to
    # be returned instead.
    return pos_sent
def __init__(self, sentence):
    """Translate `sentence` to English, tokenize it, and build its parse tree.

    Sets self.status to 0 on success; on any parsing failure it is set to 1
    so callers can detect that self.tree / self.rel were not populated.
    """
    en_parser = StanfordParser(
        path_to_jar=
        '../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar=
        '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path=
        '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
    )
    self.status = 0  # 0 = ok, 1 = parsing failed
    self.trans = googletrans.Translator()
    self.sentence = sentence.strip("\n").replace(" ", "")
    en_trans = self.trans.translate(sentence).text
    en_trans = sg.tokenize(en_trans)
    try:
        tree = list(en_parser.parse(en_trans))
        self.tree = tree[0]  # most probable parse
        self.rel = []
    # BUG FIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    except Exception:
        self.status = 1
class NLPParser(object):
    """Thin wrapper around the Stanford constituency parser for POS tagging."""

    def __init__(self):
        # English PCFG model; jar locations come from the environment.
        self.eng_parser = StanfordParser(
            model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

    def tag_pos(self, text):
        """Return ([words], [tags]) for a single space-delimited sentence."""
        tokens = text.split(' ')
        best_tree = list(self.eng_parser.parse(tokens))[0]
        pairs = best_tree.pos()
        words = [word for word, _ in pairs]
        tags = [tag for _, tag in pairs]
        return words, tags
def parser(sentence):
    """Constituency-parse a whitespace-tokenized sentence.

    Jar and model locations are taken from the module-level `path_dit` dict.
    Returns the (lazy) iterator of candidate parse trees.
    """
    chi_parser = StanfordParser(
        path_to_jar=path_dit.get('path_to_jar'),
        path_to_models_jar=path_dit.get('path_to_models_jar'),
        model_path=path_dit.get('model_path'))
    tokens = sentence.split()
    parse_iter = chi_parser.parse(tokens)
    return parse_iter
def parser(tokens):
    """Parse an already-tokenized Chinese sentence and print every candidate tree."""
    from nltk.parse.stanford import StanfordParser
    chi_parser = StanfordParser(
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp\models\lexparser\chinesePCFG.ser.gz")
    trees = list(chi_parser.parse(tokens))
    print(trees)
class RelationExtractor:
    '''
    relation extraction
    '''

    def __init__(
            self,
            model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    ):
        '''
        initialization
        '''
        # NOTE(review): model_path is accepted but never passed to
        # StanfordParser() below — confirm whether that is intentional.
        self.eng_parser = StanfordParser()
        # self.eng_parser_dependency = StanfordDependencyParser()
        self.eng_parser_dependency = StanfordNeuralDependencyParser()

    def PCFG_parse(self, sentence, draw_graph=True):
        # Constituency parse; optionally pops up an NLTK tree window.
        res = list(self.eng_parser.parse(sentence.split()))
        if draw_graph:
            res[0].draw()
        return res

    def dependency_parse(self, sentence, draw_graph=True):
        # Dependency parse; optionally draws the dependency tree.
        res = list(self.eng_parser_dependency.parse(sentence.split()))
        if draw_graph:
            res[0].tree().draw()
        return res

    def generate_relation(self, sentence, nes, draw_graph=False, dep_path_max=10**2):
        # Candidate entity pairs: all NE pairs with differing labels (nes[i][1]).
        pairs = [(nes[i], nes[j]) for i in range(len(nes) - 1)
                 for j in range(i + 1, len(nes)) if nes[i][1] != nes[j][1]]
        # skip overly long sentences / too few candidate pairs
        if len(sentence.split()) > 60 or len(pairs) < 3:
            return

        def get_relation(n1, n2):
            # token index range of an entity; assumes n[2]/n[3] are token
            # offsets — TODO confirm against the NE producer.
            get_range = lambda n: range(n[2] + 1, n[3] + 2)
            e1ind, e2ind = get_range(n1), get_range(n2)
            dep_path = nx.shortest_path(G, source=e1ind[-1], target=e2ind[-1])
            # verbs along the shortest dependency path
            vbs = filter(lambda n: G.node[n]['tag'].startswith('VB'), dep_path)
            if len(dep_path) <= dep_path_max and vbs:
                ws = sentence.split()
                r = G.node[vbs[-1]]['word']
                e1, e2 = ' '.join(ws[i - 1] for i in e1ind), ' '.join(ws[i - 1] for i in e2ind)
                # NOTE(review): Python 2 print statement and list-style use of
                # filter()/G.node — this block is Python 2 + old networkx code.
                print '{0}\n{1} | {2} | {3} | {4}'.format(
                    sentence, e1, e2, r, len(dep_path))
                return e1, e2, r, len(dep_path)
            else:
                return None, None, None, None

        rels = []
        res = self.dependency_parse(sentence, draw_graph=False)
        # NOTE(review): G/nodes/edges are initialized but never populated, and
        # get_relation/rels/pairs are never used before returning — this method
        # appears unfinished.
        G = nx.Graph()
        nodes = {}
        edges = []
        return res[0].nodes.items()
def __init__(self, sentence):
    """Translate `sentence` to English and store its constituency parse tree."""
    en_parser = StanfordParser(
        path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
        path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
    self.trans = googletrans.Translator()
    self.sentence = sentence
    # translate -> tokenize -> parse; keep the most probable tree
    translated = self.trans.translate(sentence).text
    tokens = sg.tokenize(translated)
    parses = list(en_parser.parse(tokens))
    self.tree = parses[0]
    self.rel = []
def parser_tree(self, sent):
    """Segment `sent`, parse it with the Chinese PCFG model, and return the best tree."""
    seg_sent = self.segment(sent)
    chi_parser = StanfordParser(
        model_path=r"edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz")
    trees = list(chi_parser.parse(seg_sent.split()))
    best_tree = trees[0]
    return best_tree
def parser():
    """Constituency- and dependency-parse every sentence in the module-level `content`."""
    # point NLTK at the local Stanford parser installation
    os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
    os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
    os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
    eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', java_options="-mx2048m")
    for line in content:
        best_tree = list(eng_parser.parse(line.split()))[0]
        print(best_tree)
        # best_tree.draw()
    eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    for line in content:
        graph = list(eng_dep_parser.parse(line.split()))[0]
        for triple in graph.triples():
            print(triple)
def pos_test():
    # Ad-hoc test: parse one hard-coded sentence, dump its tree to a file,
    # pretty-print it, and extract short lemmatized noun phrases.
    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'  # 3 GB JVM heap for the parser
    sentence = "so now i run out take a few shot then run back in the house and xfer them to the pc"
    res = eng_parser.parse([w for w in sentence.lower().split()])
    lst_res = list(res)[0]  # most probable parse tree
    with open('data/tree_test.txt', 'w') as file:
        file.write(sentence + '\t')
        file.write(str(lst_res) + '\t')
    lst_res.pretty_print()
    cleaned_sent = []
    # NOTE(review): iterating an nltk Tree yields its child subtrees, so each
    # `sent` here is a top-level child of the parse tree.
    for sent in lst_res:
        wnl = WordNetLemmatizer()
        tmp_sent = []
        # shallow noun phrases only (tree height <= 4)
        for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
            '''clean stop words & stemming'''
            tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
            '''lenght <= 3 & filter repeated list'''
            if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                tmp_sent.append(tmp)
        cleaned_sent.append(tmp_sent)
    # (a commented-out experiment that located the nearest JJ "opinion word"
    # for each extracted aspect used to live here)
    print(cleaned_sent)
def parser_nltk(word_lists, filename):
    """Parse each non-empty, pre-segmented sentence with the Stanford parser.

    Every parse result is stored in a shelve DB under its running index, and a
    string rendering of each tree (via return_str_tofile) is collected.

    Args:
        word_lists: iterable of space-delimited, already segmented sentences.
        filename: path of the shelve database to write parse results into.

    Returns:
        list of stringified parse trees.
    """
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'  # 15 GB heap for long sentences
    all_parser_sentence = []
    flag = 0
    # BUG FIX: the shelve DB was never closed, so buffered writes could be
    # lost; close it in a finally block even when parsing raises.
    file = shelve.open(filename)
    try:
        for sentence in word_lists:
            if sentence.strip() != "":
                res = list(chinese_parser.parse((sentence.strip()).split()))
                new_str = return_str_tofile(sentence_parse=str(res[0]))
                file[str(flag)] = res
                all_parser_sentence.append(new_str)
                flag += 1
                print("###### NLTK Dependency Parser Have finished " + str(flag) + " sentences ###")
    finally:
        file.close()
    return all_parser_sentence
def __init__(self, sentence):
    """Translate `sentence` to English, tokenize it, and store its parse tree."""
    en_parser = StanfordParser(
        path_to_jar=
        '../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar=
        '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path=
        '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    )
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
    )
    self.trans = googletrans.Translator()
    self.sentence = sentence
    # BUG FIX: googletrans' translate() returns an object with a `.text`
    # attribute, not a `.get_text()` method (see the sibling constructors in
    # this file, which use `.text`).
    result1 = sg.tokenize(self.trans.translate(sentence).text)
    tree = list(en_parser.parse(result1))
    self.tree = tree[0]  # most probable parse
    self.rel = []
# if tag != "O": # print("%-12s"%tag, " ".join(w for w, t in chunk)) #b = eng_parser.parse("Rami Eid is studying at Stony Brook University in NY".split()) eng_parser = StanfordParser( r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar", r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar" ) #print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))) eng_parser = StanfordDependencyParser( r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar", r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar" ) res = list( eng_parser.parse("the quick brown fox jumps over the lazy dog".split())) #for row in res[0].triples(): # print(row) trainfile = r'C:\Users\jingx\Dropbox\MSCF Course\NLP\NLP_Project\data\set1\a6.txt' with open(trainfile, encoding='utf8') as fin: train = fin.readlines() train = list(map(lambda x: x.strip('\n'), train)) train = list(map(lambda x: x.strip(' '), train)) train = ' '.join(train) sent_tokenize_list = sent_tokenize(train) NE = dict() for i in range(200, 240): #range(len(sent_tokenize_list)):
# Script chunk: build the input sentence from argv, parse it, and begin
# reordering the parse tree into ISL (Indian Sign Language) word order.
os.environ['JAVAHOME'] = java_path
# NOTE(review): inputString must be initialized before this chunk (not shown here).
for each in range(1, len(sys.argv)):
    inputString += sys.argv[each]
    inputString += " "
# inputString = raw_input("Enter the String to convert to ISL: ")
parser = StanfordParser(
    model_path=
    'D:/stanford-parser-full-2018-02-27/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
)
# o=parser.parse(s.split())
englishtree = [tree for tree in parser.parse(inputString.split())]
parsetree = englishtree[0]  # most probable parse
dict = {}  # NOTE(review): shadows the builtin `dict`; maps treeposition -> moved flag
# "***********subtrees**********"
parenttree = ParentedTree.convert(parsetree)
# mark every subtree position as "not yet moved"
for sub in parenttree.subtrees():
    dict[sub.treeposition()] = 0
#"----------------------------------------------"
isltree = Tree('ROOT', [])
i = 0
# NOTE(review): the body of this loop continues beyond this chunk — the
# source appears truncated here.
for sub in parenttree.subtrees():
# find_entity_t = test.find_entity() # find_VP_t = test.firstVP() # test.drawTree() test.show(firstNP_t) # test.show(find_entity_t) # test.show(find_VP_t) # # test.show(find_entity_t) # test.show(firstMinNP_t) result = test.find_realtionship(firstNP_t) print(result) test.drawTree() # # # print(test.rel) # test.show(test.find_realtionship()) # 对比实验 chi_parser = StanfordParser(path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar', path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar', model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz') data_dir='../stanford-segmenter-2018-02-27/' segmenter = StanfordSegmenter(path_to_jar=data_dir+"stanford-segmenter-3.9.1.jar", path_to_sihan_corpora_dict=data_dir+"/data", path_to_model=data_dir+"/data/pku.gz", path_to_dict=data_dir+"/data/dict-chris6.ser.gz", java_class='edu.stanford.nlp.ie.crf.CRFClassifier', ) result=segmenter.segment(test_str) result_ls = result.split() ch_tree = list(chi_parser.parse(result_ls))[0] ch_tree.draw() # print(result)
from nltk.tokenize import StanfordSegmenter # from nltk.tokenize import StanfordTokenizer segmenter = StanfordSegmenter( path_to_sihan_corpora_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data", path_to_model="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/pku.gz", path_to_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz") res = segmenter.segment(u'北海已成为中国对外开放中升起的一颗明星') print(type(res)) print(res.encode('utf-8')) from nltk.parse.stanford import StanfordParser eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())) for tree in res: print(tree) tree.draw() ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz') ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz') res1 = list(ch_parser.parse(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split())) for tree in res1: print(tree) tree.draw() from nltk.parse.stanford import StanfordDependencyParser eng_parser = StanfordDependencyParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') res2 = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())) # for row in res2[0].triples():
# Script chunk: constituency-parse `sentences` (defined earlier in the file),
# skipping the first two tokens of each sentence.
stop_words = stopwords.words('english')  # NOTE(review): unused in this chunk
eng_parser = StanfordParser(
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
eng_parser.java_options = '-mx3000m'  # 3 GB JVM heap for the parser
'''POS'''
print('=' * 100)
print('Starting POS...')
pos_sent = []
for sent in tqdm(sentences):
    pos_sent.append(list(eng_parser.parse([w for w in sent.split()[2:]
                                           ]))[0])  # ignore first two word
# (a commented-out noun-phrase filtering / lemmatization pass used to live here)
'''save file'''
# encoding: utf-8 import nltk from nltk.tokenize.stanford_segmenter import StanfordSegmenter from nltk.parse.stanford import StanfordParser segmenter=StanfordSegmenter( #分词依赖的jar包 path_to_jar=r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar", path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar", #分词数据文件夹 path_to_sihan_corpora_dict=r"/home/jiangix/document/stanford-segmenter/data", #基于北大在2005backoof上提供的人名日报语料库 path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz", path_to_dict=r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz" ) segmenter.default_config('zh') result=segmenter.segment(u'我喜欢学习编程') chi_parser = StanfordParser( r"/home/jiangix/document/stanford-parser/stanford-parser.jar", r"/home/jiangix/document/stanford-parser/stanford-parser-3.8.0-models.jar", r"/home/jiangix/document/stanford-parser/chinesePCFG.ser.gz") sentences=chi_parser.parse(result.split()) for sentence in sentences: sentence.draw()
def ch_parser(sent):
    """Parse a space-segmented Chinese sentence and print all candidate trees."""
    chi_parser = StanfordParser(
        r"E:\tools\stanfordNLTK\jar\stanford-parser.jar",
        r"E:\tools\stanfordNLTK\jar\stanford-parser-3.9.1-models.jar",
        r"E:\tools\stanfordNLTK\jar\classifiers\chinesePCFG.ser.gz")
    tokens = sent.split()
    trees = list(chi_parser.parse(tokens))
    print(trees)
def func():
    # Speech-to-ISL loop: listen on the microphone, recognize speech, reorder
    # the sentence into ISL word order, and play the matching GIF (known
    # phrases) or per-letter images (everything else).
    r = sr.Recognizer()
    # phrases/words that have a pre-recorded ISL GIF
    isl_gif = [
        'all the best', 'any questions', 'are you angry', 'are you busy',
        'are you hungry', 'are you sick', 'be careful', 'can we meet tomorrow',
        'did you book tickets', 'did you finish homework', 'do you go to office',
        'do you have money', 'do you want something to drink',
        'do you want tea or coffee', 'do you watch TV', 'dont worry',
        'flower is beautiful', 'good afternoon', 'good evening', 'good morning',
        'good night', 'good question', 'had your lunch', 'happy journey',
        'hello what is your name', 'how many people are there in your family',
        'i am a clerk', 'i am bore doing nothing', 'i am fine', 'i am sorry',
        'i am thinking', 'i am tired', 'i dont understand anything',
        'i go to a theatre', 'i love to shop',
        'i had to say something but i forgot', 'i have headache',
        'i like pink colour', 'i live in nagpur', 'lets go for lunch',
        'my mother is a homemaker', 'my name is john', 'nice to meet you',
        'no smoking please', 'open the door', 'please call an ambulance',
        'please call me later', 'please clean the room',
        'please give me your pen', 'please use dustbin dont throw garbage',
        'please wait for sometime', 'shall I help you',
        'shall we go together tommorow', 'sign language interpreter',
        'sit down', 'stand up', 'take care', 'there was traffic jam',
        'wait I am thinking', 'what are you doing', 'what is the problem',
        'what is todays date', 'what is your age', 'what is your father do',
        'what is your job', 'what is your mobile number', 'what is your name',
        'whats up', 'when is your interview', 'when we will go',
        'where do you stay', 'where is the bathroom',
        'where is the police station', 'you are wrong', 'address', 'agra',
        'ahemdabad', 'all', 'april', 'assam', 'august', 'australia', 'badoda',
        'banana', 'banaras', 'banglore', 'bihar', 'bihar', 'bridge', 'cat',
        'chandigarh', 'chennai', 'christmas', 'church', 'clinic', 'coconut',
        'crocodile', 'dasara', 'deaf', 'december', 'deer', 'delhi', 'dollar',
        'duck', 'febuary', 'friday', 'fruits', 'glass', 'grapes', 'gujrat',
        'hello', 'hindu', 'hyderabad', 'india', 'january', 'jesus', 'job',
        'july', 'july', 'karnataka', 'kerala', 'krishna', 'litre', 'mango',
        'may', 'mile', 'monday', 'mumbai', 'museum', 'muslim', 'nagpur',
        'october', 'orange', 'pakistan', 'pass', 'police station',
        'post office', 'pune', 'punjab', 'rajasthan', 'ram', 'restaurant',
        'saturday', 'september', 'shop', 'sleep', 'southafrica', 'story',
        'sunday', 'tamil nadu', 'temperature', 'temple', 'thursday', 'toilet',
        'tomato', 'town', 'tuesday', 'usa', 'village', 'voice', 'wednesday',
        'weight'
    ]
    # letters with per-letter images under letters/
    arr = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
    ]
    with sr.Microphone() as source:
        r.adjust_for_ambient_noise(source)
        i = 0
        while True:
            print('Say something')
            audio = r.listen(source)
            # recognize speech using Google's recognizer
            try:
                a = r.recognize_google(audio)
                print("you said " + a.lower())
                inputString = a.lower()
                parser = StanfordParser(
                    model_path=
                    'D:/stanford-parser-full-2018-02-27/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
                )
                # NOTE(review): the sentence is parsed twice; `o` is unused.
                o = parser.parse(inputString.split())
                englishtree = [
                    tree for tree in parser.parse(inputString.split())
                ]
                parsetree = englishtree[0]
                dict = {}  # NOTE(review): shadows builtin; treeposition -> moved flag
                parenttree = ParentedTree.convert(parsetree)
                for sub in parenttree.subtrees():
                    dict[sub.treeposition()] = 0
                isltree = Tree('ROOT', [])
                i = 0
                # move NPs (and NP/PRP inside VPs) to the front — ISL word order
                for sub in parenttree.subtrees():
                    if (sub.label() == "NP"
                            and dict[sub.treeposition()] == 0
                            and dict[sub.parent().treeposition()] == 0):
                        dict[sub.treeposition()] = 1
                        isltree.insert(i, sub)
                        i = i + 1
                    if (sub.label() == "VP" or sub.label() == "PRP"):
                        for sub2 in sub.subtrees():
                            if ((sub2.label() == "NP"
                                 or sub2.label() == 'PRP')
                                    and dict[sub2.treeposition()] == 0
                                    and dict[sub2.parent().treeposition()] == 0):
                                dict[sub2.treeposition()] = 1
                                isltree.insert(i, sub2)
                                i = i + 1
                # append any remaining single-leaf subtrees not yet placed
                for sub in parenttree.subtrees():
                    for sub2 in sub.subtrees():
                        if (len(sub2.leaves()) == 1
                                and dict[sub2.treeposition()] == 0
                                and dict[sub2.parent().treeposition()] == 0):
                            dict[sub2.treeposition()] = 1
                            isltree.insert(i, sub2)
                            i = i + 1
                parsed_sent = isltree.leaves()
                words = parsed_sent
                stop_words = set(stopwords.words("english"))
                lemmatizer = WordNetLemmatizer()
                ps = PorterStemmer()  # NOTE(review): created but unused
                lemmatized_words = []
                for w in parsed_sent:
                    # w = ps.stem(w)
                    lemmatized_words.append(lemmatizer.lemmatize(w))
                islsentence = ""
                print("According to ISL:")
                print(lemmatized_words)
                # drop stop words while rebuilding the sentence
                for w in lemmatized_words:
                    if w not in stop_words:
                        islsentence += w
                        islsentence += " "
                l = islsentence.split(" ")
                # NOTE(review): set() de-duplicates but loses word order.
                t = set(l)
                l1 = list(t)
                while ("" in l1):
                    l1.remove("")
                print(l1)
                str = " "  # NOTE(review): shadows the builtin `str`
                str = str.join(l1)
                print("Output:")
                print(str)
                # strip punctuation from the raw recognized text
                for c in string.punctuation:
                    a = a.replace(c, "")
                if (str == 'done'):
                    print("oops!Time To say good bye")
                    break
                elif (a.lower() in isl_gif):

                    class ImageLabel(tk.Label):
                        """a label that displays images, and plays them if they are gifs"""

                        def load(self, im):
                            # NOTE(review): isinstance(im, a.lower()) passes a
                            # *string* as the type argument — this looks like a
                            # bug (probably meant isinstance(im, str)); confirm.
                            if isinstance(im, a.lower()):
                                im = Image.open(im)
                            self.loc = 0
                            self.frames = []
                            # collect all GIF frames until seek() raises EOFError
                            try:
                                for i in count(1):
                                    self.frames.append(
                                        ImageTk.PhotoImage(im.copy()))
                                    im.seek(i)
                            except EOFError:
                                pass
                            # per-frame delay, default 100 ms if missing
                            try:
                                self.delay = im.info['duration']
                            except:
                                self.delay = 100
                            if len(self.frames) == 1:
                                self.config(image=self.frames[0])
                            else:
                                self.next_frame()

                        def unload(self):
                            self.config(image=None)
                            self.frames = None

                        def next_frame(self):
                            # advance the animation and reschedule itself
                            if self.frames:
                                self.loc += 1
                                self.loc %= len(self.frames)
                                self.config(image=self.frames[self.loc])
                                self.after(self.delay, self.next_frame)

                    root = tk.Tk()
                    lbl = ImageLabel(root)
                    lbl.pack()
                    lbl.load(r'C:/Users/shree/ISL/ISL_Gifs/{0}.gif'.format(
                        a.lower()))
                    root.mainloop()
                else:
                    # unknown phrase: show one image per recognized letter
                    for i in range(len(a)):
                        if (a[i] in arr):
                            ImageAddress = 'letters/' + islsentence[i] + '.jpg'
                            ImageItself = Image.open(ImageAddress)
                            ImageNumpyFormat = np.asarray(ImageItself)
                            plt.imshow(ImageNumpyFormat)
                            plt.draw()
                            plt.pause(1)  # pause how many seconds
                        else:
                            continue
            # NOTE(review): bare except hides all recognition/parsing errors.
            except:
                print("Could not listen")
                plt.close()
# Script: answer a simple question by extracting its subject (first NP) and
# predicate (first VP) from the parse tree, then issuing a SPARQL query.
import os, sys
os.environ[
    'CLASSPATH'] = '/home/Aaditya/assignments/Project/stanford-parser-full-2015-12-09/'
from nltk.parse.stanford import StanfordParser
parser = StanfordParser()
question = sys.argv[1].strip('?').lower()
tree = list(parser.parse(question.split()))[0]  # most probable parse
# first noun phrase = the entity the question is about
person = ' '.join(
    list(tree.subtrees(
        filter=lambda x: x.label() == 'NP'))[0].leaves()).lower()
l = list(tree.subtrees(filter=lambda x: x.label() == 'VP'))
# no verb phrase and a "who" question -> asking for a person
if len(l) == 0 and 'who' in question:
    what = 'who'
else:
    what = ' '.join(l[0].leaves()).lower()
print(person)
print(what)
from get_results import query
# film-related questions: substitute the entity into the SPARQL template
if 'star' in what or 'appear' in what:
    sparql = open('./films.sparql').read()
    sparql = sparql.replace('[[name]]', person)
    query(sparql)
def en_parser(str):
    """Parse a whitespace-tokenized English sentence and print all candidate trees."""
    # NOTE: the parameter shadows the builtin `str`; kept for interface compatibility.
    eng_parser = StanfordParser(
        r"E:\tools\stanfordNLTK\jar\stanford-parser.jar",
        r"E:\tools\stanfordNLTK\jar\stanford-parser-3.9.1-models.jar",
        r"E:\tools\stanfordNLTK\jar\classifiers\englishPCFG.ser.gz")
    trees = list(eng_parser.parse(str.split()))
    print(trees)
# Demo: English constituency and dependency parsing with NLTK's Stanford wrappers.
import os
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
# point NLTK at the local Stanford parser installation and JDK
os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
# NOTE(review): the same sentence is parsed twice (once for print, once for draw).
print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))
a = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))[0]
a.draw()
# dependency parse of the same sentence; print the (head, relation, dependent) triples
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
    print(row)
res[0].tree().draw()
# for word, tag in chi_tagger.tag(sent.split()): # print word.encode('utf-8'), tag # # # 英文词性标注 from nltk.tag import StanfordPOSTagger # eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger') # print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # # 中文词性标注 chi_tagger = StanfordPOSTagger('chinese-distsim.tagger') # sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星' sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续' for _, word_and_tag in chi_tagger.tag(sent.split()): word, tag = word_and_tag.split('#') print word.encode('utf-8'), tag # 中英文句法分析 区别在于词库不同 from nltk.parse.stanford import StanfordParser eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz') sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split()) for tree in eng_parser.parse(sent): tree.pprint() # 依存关系分析 from nltk.parse.stanford import StanfordDependencyParser eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz') res = list(eng_parser.parse(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())) # st(context=21) for row in res[0].triples(): print '(' + row[0][0] + ',' + row[0][1] + ')', row[1], '(' + row[2][0] + ',' + row[2][1] + ')'
def animation_view2(request):
    # Django view: convert a POSTed English sentence into ISL word order and
    # map each remaining word to an animation clip; words with no clip are
    # split into per-letter entries.
    if request.method == 'POST':
        text = request.POST.get('sen')
        #tokenizing the sentence
        # NOTE(review): text.lower() discards its result — `text` is unchanged.
        text.lower()
        #tokenizing the sentence
        words = word_tokenize(text)
        print(words)
        parser = StanfordParser(
            model_path=
            'C:/Users/Shree/Downloads/CS 753/project/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        englishtree = [tree for tree in parser.parse(text.split())]
        parsetree = englishtree[0]  # most probable parse
        dict = {}  # NOTE(review): shadows builtin; treeposition -> moved flag
        parenttree = ParentedTree.convert(parsetree)
        for sub in parenttree.subtrees():
            dict[sub.treeposition()] = 0
        # -------------------
        isltree = Tree('ROOT', [])
        i = 0
        # move NPs (and NP/PRP inside VPs) to the front — ISL word order
        for sub in parenttree.subtrees():
            if (sub.label() == "NP" and dict[sub.treeposition()] == 0
                    and dict[sub.parent().treeposition()] == 0):
                dict[sub.treeposition()] = 1
                isltree.insert(i, sub)
                i += 1
            if (sub.label() == "VP" or sub.label() == "PRP"):
                for sub2 in sub.subtrees():
                    if ((sub2.label() == "NP" or sub2.label() == 'PRP')
                            and dict[sub2.treeposition()] == 0
                            and dict[sub2.parent().treeposition()] == 0):
                        dict[sub2.treeposition()] = 1
                        isltree.insert(i, sub2)
                        i = i + 1
        # ---------------------
        # append any remaining single-leaf subtrees not yet placed
        for sub in parenttree.subtrees():
            for sub2 in sub.subtrees():
                if (len(sub2.leaves()) == 1
                        and dict[sub2.treeposition()] == 0
                        and dict[sub2.parent().treeposition()] == 0):
                    dict[sub2.treeposition()] = 1
                    isltree.insert(i, sub2)
                    i = i + 1
        parsed_sent = isltree.leaves()
        words = parsed_sent
        stop_words = set(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        ps = PorterStemmer()  # NOTE(review): created but unused
        lemmatized_words = []
        # print(parsed_sent)
        for w in parsed_sent:
            lemmatized_words.append(lemmatizer.lemmatize(w))
        islsentence = ""
        # print(lemmatized_words)
        # drop stop words while rebuilding the ISL sentence
        filtered_text = []
        for w in lemmatized_words:
            if w not in stop_words:
                filtered_text.append(w)
                islsentence += w
                islsentence += " "
        # print(islsentence)
        words = filtered_text
        print(words)
        filtered_text = []
        for w in words:
            path = w + ".mp4"
            f = finders.find(path)
            #splitting the word if its animation is not present in database
            if not f:
                for c in w:
                    filtered_text.append(c)
            #otherwise animation of word
            else:
                filtered_text.append(w)
        words = filtered_text
        return render(request, 'animation2.html', {
            'words': words,
            'text': text
        })
    else:
        return render(request, 'animation2.html')
# Download Stanford Parser:https://nlp.stanford.edu/software/lex-parser.shtml#Download; unzip # # Or download Stanford Parser: https://cloud.tsinghua.edu.cn/d/095d08f52f504f32b40d/; unzip # # Required runtime enviroment for mac also avaible at https://cloud.tsinghua.edu.cn/d/095d08f52f504f32b40d/ # from nltk.parse.stanford import StanfordParser import os # set environment variables to the path to your Stanford Parser os.environ['STANFORD_PARSER'] = '/Users/baixiaojing/StanfordNLP/stanford-parser-full-2017-06-09/stanford-parser.jar' os.environ['STANFORD_MODELS'] = '/Users/baixiaojing/StanfordNLP/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar' # choose the model for your parser eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') # parse the sentence parse = eng_parser.parse("Can you book a flight to London?".split()) # form a tree tree = list(parse)[0] # draw a tree tree.draw()
from nltk.parse.stanford import StanfordParser

# Parse a sample sentence with the default English model and print the trees.
eng_parser = StanfordParser()
tokens = "the quick brown fox jumps over the lazy dog".split()
trees = list(eng_parser.parse(tokens))
print(trees)
# Python 2 script: read a sentence, parse it, and prepare an ISL reordering.
import nltk
import os
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tree import *
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

s = raw_input("Enter string")
parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# NOTE(review): the sentence is parsed twice; `o` is never used.
o = parser.parse(s.split())
tree1 = [tree for tree in parser.parse(s.split())]
parsetree = tree1[0]  # most probable parse
dict = {}  # NOTE(review): shadows the builtin `dict`; treeposition -> flag
#output = '(ROOT (S (PP (IN As) (NP (DT an) (NN accountant))) (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))))'
#parsetree=Tree.fromstring(output)
#parsetree=parser.raw_parse(s)
print parsetree
print "***********subtrees**********"
ptree = ParentedTree.convert(parsetree)
# initialize every subtree position flag to 0 ("not yet processed")
for sub in ptree.subtrees():
    #print sub
    dict[sub.treeposition()] = 0
    # print sub.label()
print "----------------------------------------------"
#中文词性标注 chi_tagger=StanfordPOSTagger(model_filename='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/models/chinese-distsim.tagger', path_to_jar='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/stanford-postagger.jar') print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。'.split())) #英文句法分析 #import os #java_path='/usr/lib/jvm/jdk/jdk1.8.0_121' #os.environ['JAVAHOME']=java_path from nltk.internals import find_jars_within_path from nltk.parse.stanford import StanfordParser eng_parser=StanfordParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar', '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar', '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz') eng_parser.__classpath=tuple(find_jars_within_path('/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/')) print (list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))) #英文依存句法分析 from nltk.parse.stanford import StanfordDependencyParser eng_parser=StanfordDependencyParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar', '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar', '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz') res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())) print (res[0]) for row in res[0].triples(): print(row)
def test_chinese_parser():
    """Smoke-test the Stanford PCFG parser on a pre-segmented Chinese sentence.

    Requires the Stanford parser jars on the classpath and the Chinese model
    resource (chinesePCFG.ser.gz) to be locatable by NLTK.
    """
    sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
    chi_parser = StanfordParser(
        model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
    # Parenthesized single-argument print: identical output on Python 2,
    # and valid syntax on Python 3 (the original `print list(...)` statement
    # was Python-2-only).
    print(list(chi_parser.parse(sent.split())))
def main():
    """Command-line QA driver: answer a list of questions about an article.

    Usage: <script> <article_path> <question_list_path>

    Pipeline: tokenize the article into a pool of lower-cased sentences,
    then for each question retrieve the most similar pool sentence(s) and
    extract an answer with a strategy chosen by the question's first word
    (yes/no auxiliaries, why, when, who, where, how, and a parse-based
    fallback for everything else).

    NOTE(review): relies on module-level helpers not visible in this chunk
    (sent_tokenize, word_tokenize, pos_tag, similarity, similarity_why,
    intersection, searchLabel, plus the `sys` and `string` modules) and on
    Python 2 semantics (`paragraph.decode('utf-8')` on a byte string).
    Indentation of this function was reconstructed from collapsed source;
    loop nesting in the 'when'/'where' branches is a best guess — confirm
    against the original file.
    """
    # stanford_pos_dir = '/Users/yuyanzhang/Desktop/CMU/NLP/project/tools/stanford-postagger-full-2015-04-20/'
    # eng_model_filename= stanford_pos_dir + 'models/english-bidirectional-distsim.tagger'
    # my_path_to_jar= stanford_pos_dir + 'stanford-postagger.jar'
    # st = StanfordPOSTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar)
    # print(st.tag('What is the airspeed of an unladen swallow ?'.split()))
    #
    # NER tagging with the 7-class MUC model (PERSON/LOCATION/DATE/TIME/...).
    # NOTE(review): all tool paths are machine-specific absolute paths.
    stanford_ner = '/Users/yuyanzhang/Desktop/CMU/NLP/project/tools/stanford-ner-2015-04-20/'
    # stanford_ner_model = stanford_ner + 'classifiers/english.all.3class.distsim.crf.ser.gz'
    stanford_ner_model = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
    stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
    ner = StanfordNERTagger(model_filename=stanford_ner_model, path_to_jar=stanford_ner_jar)
    #print(ner.tag('Rami Eid is studying at Stony Brook University in NY'.split()))
    # Set up the stanford PCFG parser
    stanford_parser_dir = '/Users/yuyanzhang/Desktop/CMU/NLP/project/tools/stanford-parser-full-2015-04-20/'
    eng_model_path = stanford_parser_dir + "stanford-parser-3.5.2-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    my_path_to_models_jar = stanford_parser_dir + "stanford-parser-3.5.2-models.jar"
    my_path_to_jar = stanford_parser_dir + "stanford-parser.jar"
    parser=StanfordParser(model_path=eng_model_path, path_to_models_jar=my_path_to_models_jar, path_to_jar=my_path_to_jar)
    # sent = "Seth Kramer, one of the directors, describes how he first got the idea for The Linguists when in Vilnius, Lithuania, he could not read Yiddish inscriptions on a path in spite of his Jewish heritage."
    # parser_result =(parser.parse("The random person on the street eats meat.".split()))
    # for a in parser_result:
    #     getNodes(a)
    # print("\u")
    #
    # Read in the article and list of questions
    article_path = sys.argv[1]
    question_list = sys.argv[2]
    # Tokenize all sentences into a pool of lower-cased token lists.
    sentences_pool = []
    article = open(article_path).read()
    paragraphs = [p for p in article.split('\n') if p]
    for paragraph in paragraphs[1:len(paragraphs)]:
        # Skip the title (first paragraph); lower-case every token so the
        # later similarity comparisons are case-insensitive.
        # sentences = sent_tokenize(paragraph)
        sentences = sent_tokenize(paragraph.decode('utf-8'))
        for sentence in sentences:
            sentence_tokenized = [a.lower() for a in word_tokenize(sentence)]
            sentences_pool.append(sentence_tokenized)
    # Answer questions in the question list
    count = 0
    # Read in the lemmatized sentences
    # sentences_pool_lemmatized = []
    # Uncomment if the lemmatized sentence pool hasn't been generated yet.
    # This step takes a long time, so you only need to run lemmatization once
    # and you can load the lemmatized sentences from file afterwards.
    # with open('sentences_pool_lemmatized.csv','w') as f:
    #     writer = csv.writer(f,delimiter="\t")
    #     for sent in sentence_pool:
    #         sent = [a for a in sent if a != '\t']
    #         sentences_lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(sent)]
    #         sentences_pool_lemmatized.append(sentences_lemmatized)
    #         writer.writerow(sentences_lemmatized)
    # with open('sentences_pool_lemmatized.csv') as f:
    #     for line in f:
    #         line = [a.lower() for a in line.strip().split("\t")]
    #         sentences_pool_lemmatized.append(line)
    with open(question_list) as f:
        # For each question on the list
        for question in f:
            count += 1
            question_tokenized = word_tokenize(question)
            question_tokenized_lower = [a.lower() for a in question_tokenized]
            # The first token selects the answering strategy below.
            question_start = question_tokenized_lower[0]
            # # Control the type of question
            # if question_start in ['when','where']:
            #     pass
            # else:
            #     continue
            # Separate question words and question content.
            # NOTE(review): the punctuation-based filter list is immediately
            # overwritten by the literal question-word list on the next line,
            # so string.punctuation is never actually filtered.
            filtered_list = [a for a in string.punctuation]
            filtered_list = ['?','when', 'what','where','what','why','which','who','how','do','does','did','a','the','an']
            question_content = [a for a in question_tokenized_lower if a not in filtered_list]
            # Lemmatize the question
            #question_lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(question_content)]
            # Find the most similar sentences in the pool.
            max_similarity = None
            most_similar_sent = []  # We need to consider ties
            for sent_idx in range(len(sentences_pool)):
                sent = sentences_pool[sent_idx]
                #similarity_score = jaccard_similarity(sent,question_content)+similarity(sent,question_content)
                if question_start == 'why':
                    similarity_score = similarity_why(sent,question_content)
                else:
                    similarity_score = similarity(sent,question_content, 0.8)
                # Every new running maximum is appended, so the list keeps
                # earlier (lower-scoring) candidates too; ties with the
                # current maximum are NOT appended (strict `>`).
                if max_similarity == None:
                    max_similarity = similarity_score
                    # Append the original un-lemmatized sentence
                    most_similar_sent.append(sentences_pool[sent_idx])
                elif similarity_score > max_similarity:
                    max_similarity = similarity_score
                    most_similar_sent.append(sentences_pool[sent_idx])
                else:
                    pass
            # print((most_similar_sent))
            # print
            # Now, build the answer from the retrieved sentence.
            # same_word = words shared by every candidate sentence.
            same_word = set(most_similar_sent[0])
            for s in most_similar_sent[1:]:
                same_word.intersection_update(s)
            # Find the most relevant sentence: re-score each candidate with
            # the shared words removed, threshold 1.
            max_similarity_2 = None
            max_similar_sent = None
            for sent in most_similar_sent:
                sent_filtered = [a for a in sent if not a in same_word]
                similarity_socre_2 = similarity(sent_filtered,question_content, 1)
                if max_similarity_2 == None:
                    max_similarity_2 = similarity_socre_2
                    max_similar_sent = sent
                elif similarity_socre_2 > max_similarity_2:
                    max_similarity_2 = similarity_socre_2
                    max_similar_sent = sent
            # print(max_similar_sent)
            # Build answer based on different type of question
            answer = "NULL"
            try:
                # Yes/No question: answer should contain only yes or no.
                if question_start in ["is","was","are","were","do","does","did","have","has","had", "wasn't","isn't","aren't"]:
                    # First, convert sentence into a declarative sentence
                    if max_similarity_2 == 0:
                        answer = "No"
                    else:
                        question_parse = parser.parse(question_tokenized)
                        for parse in question_parse:
                            # print(parse)
                            verb = parse[0][0].leaves()
                            sub = (parse[0][1].leaves())
                            obj = (parse[0][2].leaves())
                            #substring = " ".join((sub+verb+obj))
                            # If yes, most of the words in the object should
                            # appear in the retrieved sentence.
                            obj = [a.lower() for a in obj]
                            if float(len(intersection(obj,max_similar_sent))) / len(obj) >= 0.8:
                                answer = "Yes"
                            else:
                                answer = "No"
                        # TODO: parse candidate sentence
                        # answer = "No"
                        # similar_sent_parse = parser.parse(max_similar_sent)
                        # for parse in similar_sent_parse:
                        #     verb_ = parse[0][0].leaves()
                        #     sub_ = (parse[0][2].leaves())
                        #     obj_ = (parse[0][1].leaves())
                elif question_start == 'why':
                    # Look for an explicit reason marker in the sentence.
                    # NOTE(review): str.index raises ValueError when the
                    # substring is absent — it never returns -1 — so every
                    # `reason_idx == -1` fall-back below is dead code and a
                    # missing marker lands in the outer bare `except`.
                    # `.find` was probably intended.  Also, the slice starts
                    # at len(marker), not at reason_idx — verify intent.
                    max_similar_sent_str = " ".join(max_similar_sent)
                    reason_idx = max_similar_sent_str.index('because of')
                    answer = max_similar_sent_str[len('because of'):len(max_similar_sent_str)]
                    if reason_idx == -1:
                        reason_idx = max_similar_sent_str.index('because')
                        answer = max_similar_sent_str[len('because'):len(max_similar_sent_str)]
                    if reason_idx == -1:
                        reason_idx = max_similar_sent_str.index('for')
                        answer = max_similar_sent_str[len('for'):len(max_similar_sent_str)]
                    if reason_idx == -1:
                        answer = "NULL"
                elif question_start == 'when':
                    # 1. Tag: 'DATE', 'TIME'
                    # 2.1. one PP or one CD in PP, return it
                    # 2.2. multi candidate, return max_similar_sent
                    found_DATE = False
                    max_similar_sent_tag = ner.tag(max_similar_sent)
                    # print max_similar_sent_tag # Uncomment for dry run
                    for pair in max_similar_sent_tag:
                        if pair[1] == 'DATE' or pair[1] == 'TIME':
                            answer = pair[0]
                            found_DATE = True
                    if not found_DATE:
                        #TODO: deal with this situation
                        # Fall back to the parse tree: a single PP (or the
                        # single CD inside the PPs) is taken as the answer.
                        max_similar_parse = parser.parse(max_similar_sent)
                        for mparse in max_similar_parse:
                            #print mparse
                            stack = mparse
                            # NOTE(review): assigns the parse iterator itself
                            # as a placeholder answer — verify against caller.
                            answer = max_similar_parse
                            record1 = []
                            record2 = []
                            for i in stack:
                                searchLabel(i, "PP", record1)
                            # print "-------", record1
                            if len(record1) == 1:
                                answer = record1[0].leaves()
                            else:
                                for j in record1:
                                    searchLabel(j, "CD", record2)
                                if len(record2) == 1:
                                    answer = record2[0].leaves()
                elif question_start == 'who':
                    # Answer with the first PERSON entity found by NER.
                    max_similar_sent_tag = ner.tag(max_similar_sent)
                    found_PERSON = False
                    for pair in max_similar_sent_tag:
                        if pair[1] == 'PERSON':
                            answer = pair[0]
                            found_PERSON = True
                    if not found_PERSON:
                        #TODO: deal with this situation
                        pass
                elif question_start == 'where':
                    found_LOCATION = False
                    max_similar_sent_tag = ner.tag(max_similar_sent)
                    for pair in max_similar_sent_tag:
                        # NOTE(review): both sides of this `or` test the same
                        # value ('LOCATION'); the duplicate is redundant —
                        # possibly another tag (e.g. 'ORGANIZATION') was meant.
                        if pair[1] == 'LOCATION' or pair[1] == 'LOCATION':
                            answer = pair[0]
                            found_LOCATION = True
                    if not found_LOCATION:
                        # Fall back to the parse tree, like the 'when' branch,
                        # but only accept a lone PP headed by a place preposition.
                        max_similar_parse = parser.parse(max_similar_sent)
                        for mparse in max_similar_parse:
                            #print mparse
                            stack = mparse
                            answer = max_similar_sent
                            record1 = []
                            record2 = []
                            for i in stack:
                                searchLabel(i, "PP", record1)
                            # print "-------", record1
                            if len(record1) == 1:
                                if record1[0][0][0] in ("in", "from", "at", "on", "under"):
                                    answer = record1[0].leaves()
                            else:
                                for j in record1:
                                    searchLabel(j, "CD", record2)
                                if len(record2) == 1:
                                    answer = record2[0].leaves()
                elif question_start == 'how':
                    # Quantity-style "how X" questions (old/long/many/...) get a
                    # numeric answer; anything else returns the whole sentence.
                    question_second = question_tokenized_lower[1]
                    temp = ['old', 'long', 'many', 'much', 'tall', 'heavy']
                    max_similar_sent_str = " ".join(max_similar_sent)
                    if question_second not in temp:
                        answer = max_similar_sent_str
                    else:
                        # NOTE(review): `number` is computed but never used.
                        number = [int(s) for s in max_similar_sent_str.split() if s.isdigit()]
                        tagged = pos_tag(max_similar_sent)
                        token_candidates = []
                        for token, label in tagged:
                            # Also pick up digits embedded in hyphenated
                            # tokens (e.g. "1939-1945").
                            splited = token.split('-')
                            if len(splited) > 1:
                                for t in splited:
                                    if t.isdigit():
                                        token_candidates.append(t)
                            if label == 'CD':
                                token_candidates.append(token)
                        # Ambiguous (several numbers) -> whole sentence;
                        # exactly one -> that number; none -> no answer.
                        if len(token_candidates) > 1:
                            answer = max_similar_sent_str
                        elif len(token_candidates) == 1:
                            answer = token_candidates[0]
                        else:
                            answer = "NULL"
                #For what, which, and others
                else:
                    #print(count,question)
                    try:
                        question_parse = parser.parse(question_tokenized)
                        for parse in question_parse:
                            #print(parse)
                            # NOTE(review): verb/sub are extracted but unused
                            # in this branch.
                            verb = parse[0][1].leaves()
                            sub = (parse[0][1][1].leaves())
                            #obj = (parse[0][2].leaves())
                            #print(verb,sub)
                        similar_sent_parse = parser.parse(max_similar_sent)
                        for parse in similar_sent_parse:
                            # print(parse)
                            answer = parse[0][1][1].leaves()
                    except:
                        pass
                        #TODO: deal with this situation
                #Capitalize first letter
                # NOTE(review): when `answer` is already a str (e.g. "Yes",
                # "NULL", or the 'why' suffix), `" ".join(answer)` space-
                # separates its characters ("Yes" -> "Y e s"); an isinstance
                # check on list vs str was probably intended — confirm.
                if not answer:
                    answer = "NULL"
                elif question_start == 'how':
                    answer = answer.capitalize()
                else:
                    answer = " ".join(answer)
                    a = list(answer)
                    if a:
                        a[0] = a[0].upper()
                        answer = "".join(a)
                print(' '.join(question_tokenized))
                print(answer)
            except:
                # Best-effort fallback: on any failure, print the retrieved
                # sentence instead of an extracted answer.
                print(" ".join(max_similar_sent))