def main_test3():
    parser = CoreNLPDependencyParser(url='http://localhost:9001')
    # parser = CoreNLPParser('http://localhost:9001')
    # print(list(parser.tokenize(u'中国人为什么不能选择安乐死?')))
    # -> ['中国人', '为什么', '不', '能', '选择', '安乐死', '?']
    # title = '中国人为什么不能选择安乐死?'  # "Why can't Chinese people choose euthanasia?"
    title = '我支持我觉得安乐死挺好的。'  # "I support it; I think euthanasia is quite good."
    for word in parser.tokenize(title):
        # tokenize() yields plain strings, so there is no .lemma attribute to read
        print(word)
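# Hedged sketch (not part of main_test3): since tokenize() only yields surface
# strings, lemmas have to be read from the dependency graph nodes instead. The
# server URL and the example call are the same assumptions as above.
def print_lemmas(text, url='http://localhost:9001'):
    from nltk.parse.corenlp import CoreNLPDependencyParser
    parser = CoreNLPDependencyParser(url=url)
    parse = next(parser.raw_parse(text))      # first sentence only
    for address in sorted(parse.nodes)[1:]:   # node 0 is the artificial root
        node = parse.nodes[address]
        print(node['word'], node['lemma'])
# print_lemmas('我支持我觉得安乐死挺好的。')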
def analyze(stext):
    parser = CoreNLPDependencyParser(url="http://localhost:9000")
    if '\r\n' in stext:
        stext = stext.replace('\r\n', ' ')
    iterator = parser.raw_parse(stext)
    parse = next(iterator)
    parse = add_offset_to_tree(parse, stext)
    return parse
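# Hedged usage sketch: assumes the CoreNLP server used by analyze() is running
# and that add_offset_to_tree (a project helper defined elsewhere) is available.
def demo_analyze():
    parse = analyze("Aspirin may increase the toxicity of methotrexate.")
    # DependencyGraph.triples() yields ((governor, tag), relation, (dependent, tag))
    for governor, relation, dependent in parse.triples():
        print(governor, relation, dependent)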
def main():
    parser = CoreNLPDependencyParser(url='http://localhost:9001')
    title = '中国人为什么不能选择安乐死?'  # "Why can't Chinese people choose euthanasia?"
    # parse, = parser.raw_parse(title)
    title_sub = find_NN(title, parser)

    # tokenize
    # title_token = list(parser.tokenize(title))
    # print(title_token)

    # parse danmaku and extract subjects
    # using windows: surrounding polarity
    data = pd.read_csv('demo.csv')
    # danmaku_list = data.loc[:, 'Barrages_original']

    # get index of 1. INITIAL 2. middle 3. end
    # for now: the same padding
    initial_time = 10.0  # what's the initial time duration? initial 10 seconds
    start_idx = 0
    for showing_time in data['Showing_time']:
        if showing_time < initial_time:
            start_idx += 1

    # slice danmaku list
    token = 2
    initial_dan = data.loc[:start_idx, 'Barrages_original']
    ini_subj = padding_initial(title_sub, initial_dan, parser, token)
    last_data = data[start_idx + 1:]
    # subjects = []

    # padding danmaku
    # processing padding on different timezones (call once, reuse both results)
    padding_res = process_padding(last_data, parser, title_sub, token, data)
    last_subjects = padding_res[0]
    subjects = ini_subj + last_subjects
    pprint(subjects)
    print(f'length of subjects: {len(subjects)}')

    # polarity: use the surrounding words to decide the sentiment
    sur_words = padding_res[1]
    po_list = []
    for sur in sur_words:
        po = return_polarity(sur)
        print(f'{sur}: {po}')
        po_list.append(po)
    print(len(po_list))
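# Hedged stand-in (not the project's return_polarity): to sanity-check the
# surrounding-word polarity step, a simple Chinese sentiment scorer such as
# SnowNLP can be used; it returns the probability of positive sentiment in [0, 1].
def naive_polarity(text):
    from snownlp import SnowNLP  # third-party package; an assumption, not used by the project
    return SnowNLP(text).sentiments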
def main():
    parser = CoreNLPDependencyParser(url='http://localhost:9001')
    title = '中国人为什么不能选择安乐死?'  # "Why can't Chinese people choose euthanasia?"
    # parse, = parser.raw_parse(title)
    title_sub = find_NN(title, parser)

    # parse danmaku and extract subjects
    # using windows: surrounding polarity
    data = pd.read_csv('demo.csv')

    # get index of 1. INITIAL 2. middle 3. end
    # for now: the same padding
    initial_time = 10.0  # what's the initial time duration? initial 10 seconds
    start_idx = 0
    for showing_time in data['Showing_time']:
        if showing_time < initial_time:
            start_idx += 1

    # slice danmaku list: initial + last, get subjects: list of lists
    initial_dan = data.loc[:start_idx, 'Barrages_original']
    ini_subj = padding_initial(title_sub, initial_dan, parser)
    last_data = data[start_idx + 1:]

    # padding danmaku
    # processing padding on different timezones
    last_subjects = process_padding_last(last_data, parser, title_sub, data)
    subjects = ini_subj + last_subjects

    # mapping and filter
    pair_idxs = map_subjects(subjects)

    # enumerate all of the danmaku and output pairwise opinion distances
    row_length = data.shape[0]  # how many rows
    # one observation (row) per danmaku index, so cdist produces an n x n matrix
    m = np.arange(row_length).reshape(-1, 1)
    distance.cdist(
        m, m,
        lambda u, v: op_distance(u, v, data=data, parser=parser,
                                 subjects=subjects, pair_idxs=pair_idxs))
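# Standalone illustration (not from the original code) of the cdist pattern used
# above, with a dummy metric standing in for op_distance, which needs the parser
# and the CSV data:
import numpy as np
from scipy.spatial import distance

_indices = np.arange(5).reshape(-1, 1)  # one observation per danmaku index
_demo = distance.cdist(_indices, _indices, lambda u, v: abs(int(u[0]) - int(v[0])))
# _demo is a 5 x 5 matrix of |i - j|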
def split_offset(offset):
    if ';' in offset:
        offset = offset.split(";")
    else:
        offset = [offset]
    ent_offset = []
    for off in offset:
        ent_offset.append(tuple([int(i) for i in off.split("-")]))
    return np.asarray(ent_offset)


if __name__ == '__main__':
    input_directory = '../data/Devel/'
    parser = CoreNLPDependencyParser(url="http://localhost:9000")
    sentences = {}
    all_entities = {}
    all_roots = {}
    all_parse = {}
    # Process each file in the directory
    for index_file, filename in enumerate(os.listdir(input_directory)):
        # Parse XML file
        root = parse_xml(input_directory + filename)
        print(" - File:", filename, "(", index_file + 1, "out of ",
              len(os.listdir(input_directory)), ")")
        for child in root:
            sid, text = get_sentence_info(child)
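# Standalone examples of split_offset (separate from the truncated block above):
# a single span and a discontinuous span, as they appear in entity offset strings.
# split_offset("115-123")      -> array([[115, 123]])
# split_offset("10-15;22-30")  -> array([[10, 15], [22, 30]])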
def main():
    parser = CoreNLPDependencyParser(url='http://localhost:9001')
    title = '中国人为什么不能选择安乐死?'  # "Why can't Chinese people choose euthanasia?"
    # parse, = parser.raw_parse(title)
    title_sub = find_NN(title, parser)

    # parse danmaku and extract subjects
    # using windows: surrounding polarity
    data = pd.read_csv('../temp/demo.csv')

    # get index of 1. INITIAL 2. middle 3. end
    # for now: the same padding
    initial_time = 10.0  # what's the initial time duration? initial 10 seconds
    start_idx = 0
    for showing_time in data['Showing_time']:
        if showing_time < initial_time:
            start_idx += 1

    # slice danmaku list: initial + last, get subjects: list of lists
    initial_dan = data.loc[:start_idx, 'Barrages_original']
    ini_subj = padding_initial(title_sub, initial_dan, parser)
    last_data = data[start_idx + 1:]

    # padding danmaku
    # processing padding on different timezones
    last_subjects = process_padding_last(last_data, parser, title_sub, data)
    subjects = ini_subj + last_subjects
    pair_idxs = map_subjects(subjects)

    # mapping sentence index
    map_sens = mappingsen(pair_idxs)

    # enumerate all of the danmaku and output pairwise opinion distances
    row_length = data.shape[0]  # how many rows
    count_zero_dis = 0   # how many pairs get a default distance of 0?
    count_zero_dis1 = 0  # how many pairs get a computed distance?

    # save results into a table: first sentence; second sentence; opinion distance
    sentence_0_list = []
    sentence_1_list = []
    op_dis_list = []
    df = pd.DataFrame(columns=('first sentence', 'second sentence', 'op_distance'))

    import time
    with open('distance.txt', 'w') as f:
        start_t = time.time()
        for m in range(row_length):
            n = m + 1
            while n < row_length:
                if (m, n) in map_sens:
                    sentence_0_list.append(m)
                    sentence_1_list.append(n)
                    op_d = op_distance(subjects, m, n, data, parser, pair_idxs)
                    op_dis_list.append(op_d)
                    count_zero_dis1 += 1
                    f.write(f'the distance between danmaku {m} and danmaku {n} is: {op_d}\n')
                else:
                    sentence_0_list.append(m)
                    sentence_1_list.append(n)
                    f.write(f'the distance between danmaku {m} and danmaku {n} is: 0.0\n')
                    op_dis_list.append(0.0)
                    count_zero_dis += 1
                n += 1
        end_t = time.time()

    print(f'it takes {end_t - start_t:.2f} seconds to run 104 danmakus')
    print(f'total zero distance: {count_zero_dis}')
    print(f'total non-zero distance: {count_zero_dis1}')
    print(len(op_dis_list))

    df['first sentence'] = sentence_0_list
    df['second sentence'] = sentence_1_list
    df['op_distance'] = op_dis_list
    df.to_csv('distance_table.csv', index=False)
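# Hedged follow-up (not in the original script): the saved table can be pivoted
# back into a square (upper-triangular) distance matrix for inspection or clustering.
def load_distance_matrix(path='distance_table.csv'):
    dist_df = pd.read_csv(path)
    return dist_df.pivot(index='first sentence',
                         columns='second sentence',
                         values='op_distance')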
    # (inside a loop over the embedding file's lines, not shown: the first line
    # is a header, every other line is "word v1 v2 ... vn")
    if index == 0:
        index += 1
        continue
    index += 1
    word_vector = line.split(" ")
    word = word_vector[0]
    vector_list = []
    for element in word_vector[1:]:
        vector_list.append(float(element))
    vector = np.asarray(vector_list)
    dic[word] = vector

# connect to CoreNLP server
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')


def generate_result(sents, predictions):
    COLOR = ['orange', 'violet']
    sentences = []
    # print(sents)
    # print(predictions)
    for i in range(len(sents)):
        print(predictions[i])
        sentence = ""
        for j in range(len(sents[i])):
            if predictions[i][j] == '0':
                sentence += sents[i][j]
                sentence += " "
            elif predictions[i][j] == '1' or predictions[i][j] == '2':
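# Hedged sketch (not part of the original file): once `dic` maps words to vectors,
# tokens are usually looked up with a zero-vector fallback for out-of-vocabulary
# words; the dimension 300 is an assumption.
def lookup_vector(word, dic, dim=300):
    return dic.get(word, np.zeros(dim))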
from nltk import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

out_file = open('./data_semEval/laptop/raw_parse_laptop_test', 'w')
in_file = open('./data_semEval/laptop/laptop_test.txt', 'r')

# write one CoNLL (4-column) parse per input line, separated by blank lines
for line in in_file:
    parse, = dep_parser.raw_parse(line)
    out_file.write(parse.to_conll(4))
    out_file.write('\n')

in_file.close()
out_file.close()
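# Hedged sketch (not in the original script): one way to read the written
# CoNLL-4 parses back, assuming sentences stay separated by blank lines.
from nltk.parse import DependencyGraph

with open('./data_semEval/laptop/raw_parse_laptop_test') as f:
    blocks = [b for b in f.read().split('\n\n') if b.strip()]
graphs = [DependencyGraph(b) for b in blocks]
print(len(graphs), 'parses loaded')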