def merge_subject_sentiment_value(subject_file_path, sentiment_file_path, result_file_path):
    """Merge subject predictions and sentiment-value predictions into one submission file.

    Lines in the two input files are assumed to be aligned by index: line i of the
    sentiment file holds the sentiment value for line i of the subject file.
    """
    subject_file_lines = file_utils.read_all_lines(subject_file_path)
    sentiment_file_lines = file_utils.read_all_lines(sentiment_file_path)
    result = ['content_id,subject,sentiment_value,sentiment_word']
    for i, subject_line in enumerate(subject_file_lines):
        subject_line_parts = subject_line.split(',')
        sentiment_value = sentiment_file_lines[i].split(',')[1]
        result.append(subject_line_parts[0] + ',' + subject_line_parts[2] + ','
                      + sentiment_value + ',')
    file_utils.write_lines(result, result_file_path)
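# Minimal usage sketch. The file names are hypothetical; the line formats are
# inferred from the field indices used above (subject file: content_id,content,subject;
# sentiment file: content_id,sentiment_value).
merge_subject_sentiment_value(
    'subject_predict.csv',
    'sentiment_value_predict.csv',
    'merged_submission.csv')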
def read_test_labels(file_path):
    """Read test labels from a headed csv file.

    Args:
        file_path: path to a file whose first line is a header and whose remaining
            lines look like "content_id,label,label,..."

    Returns:
        list of int-label lists, one per data line
    """
    lines = file_utils.read_all_lines(file_path)
    labels = [[int(label) for label in line.split(',')[1:]] for line in lines[1:]]
    return labels
def read_labels(file_path):
    """Read labels from a headerless file.

    Args:
        file_path: path to a file whose first comma field holds space-separated
            int labels, e.g. "2 5,content"

    Returns:
        list of int-label lists, one per line
    """
    lines = file_utils.read_all_lines(file_path)
    labels = [[int(label) for label in line.split(',')[0].split(' ')]
              for line in lines]
    return labels
def read_subject_train_ids(file_path):
    """Read sample ids from the subject training file.

    Args:
        file_path: path to a file whose third comma field is the sample id

    Returns:
        list of ids, one per line
    """
    lines = file_utils.read_all_lines(file_path)
    ids = [line.split(',')[2] for line in lines]
    return ids
def read_features(file_path):
    """Read feature text from a file.

    Args:
        file_path: path to a file whose second comma field holds the feature text

    Returns:
        list of feature strings, one per line
    """
    lines = file_utils.read_all_lines(file_path)
    features = [line.split(',')[1] for line in lines]
    return features
def read_subject_of_sentiment_value(file_path):
    """Read the subject field from a sentiment-value file.

    Args:
        file_path: path to a file whose third comma field is the subject

    Returns:
        list of subjects, one per line
    """
    lines = file_utils.read_all_lines(file_path)
    subjects = [line.split(',')[2] for line in lines]
    return subjects
def read_field(file_path, field_index, separator=',', has_head=True):
    """Read a single field from every line of a delimited file.

    Args:
        file_path: path to the file to read
        field_index: zero-based index of the field to extract
        separator: field separator, ',' by default
        has_head: if True, skip the first (header) line

    Returns:
        list of the extracted field values, one per line
    """
    lines = file_utils.read_all_lines(file_path)
    if has_head:
        lines = lines[1:]
    fields = [line.split(separator)[field_index] for line in lines]
    return fields
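# read_field generalizes the single-purpose readers above; e.g. read_features is
# equivalent to the following call (hypothetical file name):
features = read_field('train_features.csv', 1, has_head=False)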
def convert_subject_sentiment_value_predict_result(
        subject_subject_sentiment_value_file_path, result_file_path):
    """Expand joint subject_sentiment predictions into one submission row per subject.

    Each input line looks like "content_id,subject_value|subject_value|...":
    '|' separates predictions and '_' separates a subject from its sentiment value.
    """
    subject_sentiment_value_file_lines = file_utils.read_all_lines(
        subject_subject_sentiment_value_file_path)
    result = ['content_id,subject,sentiment_value,sentiment_word']
    for subject_line in subject_sentiment_value_file_lines:
        id_subjects = subject_line.split(',')
        subjects = id_subjects[1].split('|')
        for subject in subjects:
            result.append(id_subjects[0] + ',' + subject.replace('_', ',') + ',')
    file_utils.write_lines(result, result_file_path)
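# Worked example (hypothetical data): the prediction line
#   "id_1,power_0|price_-1"
# expands into two submission rows:
#   "id_1,power,0,"
#   "id_1,price,-1,"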
from nlp_tasks.absa.conf import data_path
from nlp_tasks.absa.utils import file_utils

# Index the training data by content so test samples whose content also appears
# in the training set can reuse the gold subject and sentiment_value labels.
train_data_lines = file_utils.read_all_lines(data_path.train_file_path)[1:]
train_data_content_line_map = {}
for train_data_line in train_data_lines:
    parts = train_data_line.split(',')
    if parts[1] not in train_data_content_line_map:
        train_data_content_line_map[parts[1]] = []
    train_data_content_line_map[parts[1]].append(train_data_line)

test_public_for_sentiment_lines = file_utils.read_all_lines(
    data_path.test_public_for_sentiment_value_file_path)
result = ['content_id,subject,sentiment_value,sentiment_word']
in_train_data = set()
in_train_data_for_submit = []
for test_public_for_sentiment_line in test_public_for_sentiment_lines:
    parts = test_public_for_sentiment_line.split(',')
    if parts[1] in train_data_content_line_map:
        # Handle each duplicated content only once, copying every gold label row.
        if parts[1] not in in_train_data:
            in_train_data.add(parts[1])
            in_train_data_samples = train_data_content_line_map[parts[1]]
            for in_train_data_sample in in_train_data_samples:
                in_train_data_sample_parts = in_train_data_sample.split(',')
                row = (parts[0] + ',' + in_train_data_sample_parts[2] + ','
                       + in_train_data_sample_parts[3] + ',')
                result.append(row)
                in_train_data_for_submit.append(row)
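# Persist the copied rows; the file name matches what the result-merging script
# below reads back. (Writing out `result` as a full submission is assumed to
# happen elsewhere.)
file_utils.write_lines(in_train_data_for_submit,
                       data_path.data_base_dir + 'in_train_data_for_submit')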
from keras.preprocessing import sequence

# tokenizer, maxlen, model, train_file_path, test_file_path and the project's
# data_utils / result_utils / file_utils helpers are module-level context
# defined elsewhere.


def predict(data, model):
    """Predict a sentiment value for every "content_id,text,subject" line in data."""
    predict_labels = []
    for sample in data:
        parts = sample.split(',')
        text = parts[1]
        x_train = tokenizer.texts_to_sequences([text])
        x_train_len = [len(element) for element in x_train]
        subject = parts[2]
        # Repeat the subject to match the tokenized text length (project helper).
        subject_repeated = data_utils.repeat_element_in_list([subject], x_train_len)
        x_subject = tokenizer.texts_to_sequences(subject_repeated)
        x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
        x_subject = sequence.pad_sequences(x_subject, maxlen=maxlen)
        prob = model.predict([x_train, x_subject])
        predict_label = result_utils.convert_sentiment_value_predict(prob)
        print(predict_label)
        predict_labels.append(predict_label)
    return predict_labels


train_data = file_utils.read_all_lines(train_file_path)
test_data = file_utils.read_all_lines(test_file_path)
print('train')
train_sample = predict(train_data, model)
print('test')
test_sample = predict(test_data, model)
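# Assumed surrounding context for the snippet above (hypothetical sketch, not
# the project's actual setup): a Tokenizer fitted on the training corpus and a
# trained two-input Keras model restored from disk.
#
#   from keras.models import load_model
#   from keras.preprocessing.text import Tokenizer
#
#   tokenizer = Tokenizer()
#   tokenizer.fit_on_texts(train_texts)  # train_texts: hypothetical corpus list
#   maxlen = 100                         # assumed padding length
#   model = load_model('sentiment_value_model.h5')  # hypothetical file name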
from nlp_tasks.absa.conf import data_path
from nlp_tasks.absa.utils import file_utils

in_train_data_for_submit = file_utils.read_all_lines(
    data_path.data_base_dir + 'in_train_data_for_submit')
in_train_data_for_submit_ids = {
    line.split(',')[0] for line in in_train_data_for_submit
}
result_file_name = 'test_public.result_20181028232554_caokong_xingneng.csv'
result = file_utils.read_all_lines(data_path.data_base_dir + result_file_name)
merge_result = [result.pop(0)]
for line in result:
    content_id = line.split(',')[0]
    if content_id in in_train_data_for_submit_ids:
        continue
    merge_result.append(line)
merge_result.extend(in_train_data_for_submit)
file_utils.write_lines(
    merge_result,
    data_path.data_base_dir + result_file_name + '.merge_result_and_in_train')