import os
import shelve

from nltk.corpus import stopwords
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm


def POS_data(self):
    """POS sentences: parse each sentence and save the trees."""
    tag = 'pos'
    idx = 19
    file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
    with open(file_name, 'r') as file:
        sentences = file.read().strip().split('\n')

    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'  # raise the JVM heap limit

    print('=' * 100)
    print('current tag: {}, file idx: {}'.format(tag, idx))

    '''POS'''
    print('=' * 100)
    print('Starting POS...')
    pos_sent = []
    for sent in tqdm(sentences):
        # parse() yields candidate trees; keep the best-scoring one
        pos_sent.append(list(eng_parser.parse(sent.split()))[0])

    '''save file'''
    save_file = 'data/{}_sent/{}_sent_{}.csv'.format(tag, tag, idx)
    with open(save_file, mode='w') as file:
        for sent, pos in zip(sentences, pos_sent):
            # one record per line: sentence <tab> bracketed parse tree
            file.write(sent + '\t')
            file.write(str(pos) + '\n')
    print('Finish! Saved in {}'.format(save_file))
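# For reference, eng_parser.parse(tokens) yields nltk.Tree objects, so each
# saved record is "<sentence>\t<bracketed tree>". Illustrative example only
# (the exact tree depends on the englishPCFG model):
#
#     the camera works great	(ROOT (S (NP (DT the) (NN camera)) (VP (VBZ works) (ADJP (JJ great)))))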
def pos_test():
    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(
        model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'

    sentence = "so now i run out take a few shot then run back in the house and xfer them to the pc"
    res = eng_parser.parse(sentence.lower().split())
    lst_res = list(res)[0]
    with open('data/tree_test.txt', 'w') as file:
        file.write(sentence + '\t')
        file.write(str(lst_res) + '\n')
    # print(lst_res)
    lst_res.pretty_print()
    # lst_res.remove(Tree('NN', ['camera']))

    cleaned_sent = []
    wnl = WordNetLemmatizer()
    for sent in lst_res:
        tmp_sent = []
        # keep only short noun phrases close to the leaves of the tree
        for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
            '''clean stop words & stemming'''
            tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves()
                   if w not in stop_words]
            '''length <= 3 & filter repeated lists'''
            if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                tmp_sent.append(tmp)
        cleaned_sent.append(tmp_sent)

    # get opinion word: for each aspect, pick the nearest JJ in the sentence
    # for w in cleaned_sent[0]:
    #     print(w)
    #     words = sentence.split()
    #     min_dist = len(words)
    #     min_asp = w
    #     for s in lst_res.subtrees(lambda t: t.label() == 'JJ'):
    #         dist = abs(words.index(s.leaves()[0]) - words.index(w[0]))
    #         if dist < min_dist:
    #             min_dist = dist
    #             min_asp = s.leaves()[0]
    #     if min_asp == w:
    #         print('not found')
    #     else:
    #         print(min_asp)

    print(cleaned_sent)
def parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER_PATH
    os.environ['STANFORD_MODELS'] = STANFORD_PARSER_MODELS

    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    # put every jar found under the Stanford directory on the classpath
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'

    all_parser_sentence = []
    flag = 0
    with shelve.open(filename) as file:
        for sentence in word_lists:
            if sentence.strip() != "":
                res = list(chinese_parser.parse(sentence.strip().split()))
                new_str = return_str_tofile(sentence_parse=str(res[0]))
                file[str(flag)] = res
                all_parser_sentence.append(new_str)
                flag += 1
                print("###### NLTK Dependency Parser has finished "
                      + str(flag) + " sentences ###")
    return all_parser_sentence
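# Minimal usage sketch (assumptions: JAVA_PATH, STANFORD_PARSER_PATH,
# STANFORD_PARSER_MODELS, nltk_parse_model_path and return_str_tofile are
# defined elsewhere in this repo; the sample input is illustrative only).
# The parser expects pre-segmented, space-joined tokens:
#
#     segmented = ['我 喜欢 这 个 相机']
#     trees = parser_nltk(segmented, 'data/parse_shelf')
#     print(trees[0])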
# NOTE: the left-hand side and first component of this assignment were
# truncated in the original; os.environ['CLASSPATH'] with a leading parser
# directory is a plausible reconstruction, matching the three user_name
# arguments passed to format().
os.environ['CLASSPATH'] = '/home/{}/stanford/parser:' \
    '/home/{}/stanford/parser/stanford-parser.jar:' \
    '/home/{}/stanford/parser/stanford-parser-3.9.2-models.jar'.format(
        user_name, user_name, user_name)

# tag = 'pos'
# idx = 19
# file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
file_name = 'data/nor_clas.csv'
with open(file_name, 'r') as file:
    sentences = file.read().strip().split('\n')

stop_words = stopwords.words('english')
eng_parser = StanfordParser(
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
eng_parser.java_options = '-mx3000m'

# print('=' * 100)
# print('current tag: {}, file idx: {}'.format(tag, idx))

'''POS'''
print('=' * 100)
print('Starting POS...')
pos_sent = []
# for sent in tqdm(sentences):
#     pos_sent.append(list(eng_parser.parse(
#         [w for w in sent.split()]))[0])
for sent in tqdm(sentences):
    # ignore the first two words of each line (e.g. an id/label prefix)
    pos_sent.append(list(eng_parser.parse(sent.split()[2:]))[0])

'''filter noun phrase & NLTK stemming'''
# print('=' * 100)
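# The NP-filtering step announced above was truncated here; a sketch mirroring
# the same logic used in pos_test() (assumption: identical filtering is
# intended for this script):
#
#     wnl = WordNetLemmatizer()
#     cleaned = []
#     for tree in pos_sent:
#         tmp_sent = []
#         for s in tree.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
#             tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves()
#                    if w not in stop_words]
#             if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
#                 tmp_sent.append(tmp)
#         cleaned.append(tmp_sent)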
split = ["trial", "train", "test_annotated"]
for s in split:
    f = open("SICK_" + s + ".txt", "r")
    lines = f.readlines()
    sentences = []
    labels = []
    # skip the header line; each data line holds a sentence pair and its label
    for i in range(1, len(lines)):
        a = lines[i].split("\t")
        sentences.extend([a[1], a[2]])
        labels.extend([a[3], a[3]])

    parser = StanfordParser(
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    stanford_dir = parser._classpath[0].rpartition('/')[0]
    parser._classpath = tuple(find_jars_within_path(stanford_dir))
    parser.java_options = '-mx5000m'  # to increase the amount of RAM it can use

    # a = [parse.tree()._pformat_flat("", "()", False)
    #      for parse in parser.raw_parse("The young boys are playing outdoors "
    #                                    "and the man is smiling nearby")]
    a = [[parse for parse in dep_graphs]
         for dep_graphs in parser.raw_parse_sents(sentences)]

    file = open("SICK_cons_parse_" + s + ".txt", "w")
    for i in range(len(a)):
        for j in range(len(a[i])):
            a[i][j].chomsky_normal_form(horzMarkov=1)
            a[i][j].collapse_unary(collapsePOS=True)
            d = a[i][j]._pformat_flat("", "()", False)
            # replace the ROOT label with the sentence's gold label
            sent1 = d.replace("ROOT", labels[i], 1)
            file.write(sent1 + "\n")
    file.close()
    f.close()
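# Quick illustration of the two tree normalisations applied above, using the
# plain nltk.Tree API (the sample tree is illustrative only):
#
#     from nltk import Tree
#     t = Tree.fromstring('(ROOT (S (NP (DT the) (NN dog)) (VP (VBZ barks))))')
#     t.chomsky_normal_form(horzMarkov=1)  # binarise; keep 1 sibling of context
#     t.collapse_unary(collapsePOS=True)   # merge unary chains (joined with '+')
#     print(t._pformat_flat('', '()', False))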