def __extra_examples_from_org_file__(self, org_file, des_filename):
    # Return the cached examples if they were already extracted.
    if file_tool.check_file(des_filename):
        example_dicts = file_tool.load_data_pickle(des_filename)
        return example_dicts
    example_dicts = []
    rows = file_tool.read_tsv(org_file)
    for i, row in enumerate(rows):
        # Skip the header row.
        if i == 0:
            continue
        if len(row) != 6:
            raise RuntimeError('expected 6 columns, got {} in row {}'.format(len(row), i))
        example_temp = {
            'qes_id1': int(row[1]),
            'qes1': str(row[3]).strip(),
            'qes_id2': int(row[2]),
            'qes2': str(row[4]).strip(),
            'label': int(row[5]),
            'id': int(row[0]),
        }
        example_dicts.append(example_temp)
    file_tool.save_data_pickle(example_dicts, des_filename)
    return example_dicts
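# A minimal sketch of the row-to-example mapping performed above, assuming the
# standard QQP TSV layout "id  qid1  qid2  question1  question2  is_duplicate";
# the sample row below is hypothetical, not taken from the corpus.
_sample_row = ['447', '895', '896', 'What is a pulse?', 'What is pulse rate?', '0']
_sample_example = {
    'qes_id1': int(_sample_row[1]),   # 895
    'qes1': _sample_row[3].strip(),   # 'What is a pulse?'
    'qes_id2': int(_sample_row[2]),   # 896
    'qes2': _sample_row[4].strip(),   # 'What is pulse rate?'
    'label': int(_sample_row[5]),     # 0 = not duplicates
    'id': int(_sample_row[0]),        # 447
}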
def train_model(self):
    train_loader_tuple_list = self.data_loader_dict['train_loader_tuple_list']
    avg_result = np.array([0, 0], dtype=float)  # np.float is deprecated; use the builtin
    record_list = []
    for tuple_index, train_loader_tuple in enumerate(train_loader_tuple_list, 1):
        # Re-create the framework so that each fold trains from scratch.
        self.create_framework()
        self.logger.info('{} was created!'.format(self.framework.name))
        train_loader, valid_loader = train_loader_tuple
        self.logger.info('train_loader:{} valid_loader:{}'.format(
            len(train_loader), len(valid_loader)))
        self.logger.info('begin train {}-th fold'.format(tuple_index))
        result = self.__train_fold__(train_loader=train_loader,
                                     valid_loader=valid_loader)
        self.trial_step = self.arg_dict['epoch'] * tuple_index
        avg_result += np.array(result[0:2], dtype=float)
        record_list.append(result[3])
    record_file = file_tool.connect_path(
        self.framework.arg_dict['model_path'], 'record_list.pkl')
    file_tool.save_data_pickle(record_list, record_file)
    avg_result = (avg_result / len(train_loader_tuple_list)).tolist()
    avg_result.append('finish')
    self.logger.info('avg_acc:{}'.format(avg_result[0]))
    return avg_result
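# The code above does not show how 'train_loader_tuple_list' is constructed.
# A plausible sketch, assuming k-fold cross-validation over a torch Dataset;
# build_train_loader_tuples and its parameters are hypothetical, not part of
# the original code.
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Subset

def build_train_loader_tuples(dataset, k=5, batch_size=32):
    # Each tuple pairs a training loader with the loader of its held-out fold,
    # matching the (train_loader, valid_loader) unpacking in train_model.
    tuples = []
    for train_idx, valid_idx in KFold(n_splits=k, shuffle=True).split(range(len(dataset))):
        train_loader = DataLoader(Subset(dataset, train_idx),
                                  batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(Subset(dataset, valid_idx),
                                  batch_size=batch_size)
        tuples.append((train_loader, valid_loader))
    return tuples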
def __extra_sentences_from_org_file__(self, org_file, des_filename):
    # Return the cached sentence dict if it was already extracted.
    if file_tool.check_file(des_filename):
        sentence_dict = file_tool.load_data_pickle(des_filename)
        return sentence_dict
    sentence_dict = {}
    rows = file_tool.load_data(org_file, mode='r')
    for i, row in enumerate(rows):
        # Skip the header row.
        if i == 0:
            continue
        result = row.split("\t")
        if len(result) != 6:
            raise RuntimeError('expected 6 columns, got {} in row {}'.format(len(result), i))
        if not general_tool.is_number(result[1]):
            raise RuntimeError('question id 1 is not a number: {}'.format(result[1]))
        if not general_tool.is_number(result[2]):
            raise RuntimeError('question id 2 is not a number: {}'.format(result[2]))
        # Keep only non-empty sentences; report ids whose text is missing.
        if str(result[3]).strip() == '':
            print('empty sentence id:{}'.format(str(result[1]).strip()))
        else:
            sentence_dict[str(result[1]).strip()] = str(result[3]).strip()
        if str(result[4]).strip() == '':
            print('empty sentence id:{}'.format(str(result[2]).strip()))
        else:
            sentence_dict[str(result[2]).strip()] = str(result[4]).strip()
    file_tool.save_data_pickle(sentence_dict, des_filename)
    return sentence_dict
def get_qqp_obj(force=False):
    global single_qqp_obj
    if force or (single_qqp_obj is None):
        single_qqp_obj_file = file_tool.connect_path("corpus/qqp", 'qqp_obj.pkl')
        if file_tool.check_file(single_qqp_obj_file):
            single_qqp_obj = file_tool.load_data_pickle(single_qqp_obj_file)
        else:
            single_qqp_obj = Qqp()
            file_tool.save_data_pickle(single_qqp_obj, single_qqp_obj_file)
    return single_qqp_obj
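# Usage sketch for the lazy singleton above, as implied by the code: the first
# call loads the on-disk pickle or builds a fresh Qqp(); later calls reuse the
# in-memory object; force=True re-reads the pickle (or rebuilds if none exists)
# even when an in-memory object is present.
qqp = get_qqp_obj()                  # load or build once
qqp_again = get_qqp_obj()            # reuses the same object
qqp_fresh = get_qqp_obj(force=True)  # bypass the in-memory cache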
def show_pared_info(self):
    print('the count of dep types:{}'.format(self.parse_info.dependency_count))
    print('the max len of sentence_tokens:{}, corresponding sent id:{}'.format(
        self.parse_info.max_sent_len, self.parse_info.max_sent_id))
    print('the average len of sentence_tokens:{}'.format(
        self.parse_info.avg_sent_len))
    sent_len_table = self.parse_info.sent_len_table
    file_tool.save_data_pickle(
        sent_len_table,
        file_tool.connect_path(self.data_path, "sent_len_table.pkl"))
    plt.bar(range(1, len(sent_len_table) + 1), sent_len_table)
    plt.title("sentence tokens length distribution")
    plt.show()
def parse_sentences(self):
    parsed_sentence_org_file = 'corpus/mrpc/parsed_sentences.txt'
    parsed_sentence_dict_file = 'corpus/mrpc/parsed_sentence_dict.pkl'
    if file_tool.check_file(parsed_sentence_dict_file):
        parsed_sentence_dict = file_tool.load_data_pickle(
            parsed_sentence_dict_file)
    else:
        parsed_sentence_dict = parser_tool.extra_parsed_sentence_dict_from_org_file(
            parsed_sentence_org_file)
        file_tool.save_data_pickle(parsed_sentence_dict,
                                   parsed_sentence_dict_file)
    if len(parsed_sentence_dict) != len(self.sentence_dict):
        # A length mismatch is tolerated for this corpus; the key comparison
        # below is the authoritative consistency check.
        pass
    if not general_tool.compare_two_dict_keys(self.sentence_dict.copy(),
                                              parsed_sentence_dict.copy()):
        raise ValueError("parsed_sentence_dict does not match sentence_dict")
    for sent_id, parse_info in parsed_sentence_dict.items():
        sent_id = str(sent_id)
        self.sentence_dict[sent_id].parse_info = parse_info
    self.parse_info = parser_tool.process_parsing_sentence_dict(
        parsed_sentence_dict, modify_dep_name=True)
    numeral_sentence_dict = self.parse_info.numeral_sentence_dict
    if not general_tool.compare_two_dict_keys(
            self.sentence_dict.copy(), numeral_sentence_dict.copy()):
        raise ValueError("numeral_sentence_dict does not match sentence_dict")
    for sent_id in self.sentence_dict.keys():
        self.sentence_dict[sent_id].syntax_info = numeral_sentence_dict[sent_id]
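# A minimal sketch of the key comparison that both parse_sentences variants
# rely on; the real general_tool.compare_two_dict_keys may differ (the .copy()
# calls at the call sites suggest it may mutate its arguments), this shows only
# the contract assumed above: same key set, order-insensitive.
def compare_two_dict_keys(dict1, dict2):
    # True when both dicts contain exactly the same keys.
    return set(dict1.keys()) == set(dict2.keys())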
def parse_sentences(self):
    parsed_sentence_org_file = file_tool.connect_path(
        self.data_path, 'parsed_sentences.txt')
    parsed_sentence_dict_file = file_tool.connect_path(
        self.data_path, 'parsed_sentence_dict.pkl')
    if file_tool.check_file(parsed_sentence_dict_file):
        parsed_sentence_dict = file_tool.load_data_pickle(
            parsed_sentence_dict_file)
    else:
        parsed_sentence_dict = parser_tool.extra_parsed_sentence_dict_from_org_file(
            parsed_sentence_org_file)
        file_tool.save_data_pickle(parsed_sentence_dict,
                                   parsed_sentence_dict_file)
    if len(parsed_sentence_dict) != len(self.sentence_dict):
        raise ValueError("parsed_sentence_dict does not match sentence_dict")
    if not general_tool.compare_two_dict_keys(self.sentence_dict.copy(),
                                              parsed_sentence_dict.copy()):
        raise ValueError("parsed_sentence_dict does not match sentence_dict")
    for sent_id, info in parsed_sentence_dict.items():
        if info['original'] != self.sentence_dict[sent_id].original:
            raise ValueError("parsed_sentence_dict does not match sentence_dict")
    for sent_id, parse_info in parsed_sentence_dict.items():
        sent_id = str(sent_id)
        self.sentence_dict[sent_id].parse_info = parse_info
    self.parse_info = parser_tool.process_parsing_sentence_dict(
        parsed_sentence_dict, modify_dep_name=True)
    numeral_sentence_dict = self.parse_info.numeral_sentence_dict
    self.max_sent_len = self.parse_info.max_sent_len
    if not general_tool.compare_two_dict_keys(
            self.sentence_dict.copy(), numeral_sentence_dict.copy()):
        raise ValueError("numeral_sentence_dict does not match sentence_dict")
    for sent_id in self.sentence_dict.keys():
        self.sentence_dict[sent_id].syntax_info = numeral_sentence_dict[sent_id]
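# Sketch of the entry shape parse_sentences expects from
# parser_tool.extra_parsed_sentence_dict_from_org_file, inferred from the
# checks above (the 'original' field is compared against Sentence.original);
# the sample sentence and any further fields are assumptions.
_parsed_sentence_dict_shape = {
    '1': {
        'original': 'The quick brown fox jumps over the lazy dog .',
        # ...plus parser output such as tokens and dependency arcs
    },
}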
def __extra_examples_from_org_file__(self, org_file, des_filename):
    # Return the cached examples if they were already extracted.
    if file_tool.check_file(des_filename):
        example_dicts = file_tool.load_data_pickle(des_filename)
        return example_dicts
    example_dicts = []
    rows = file_tool.load_data(org_file, mode='r')
    examples_id = 0
    for i, row in enumerate(rows):
        # Skip the header row.
        if i == 0:
            continue
        result = row.split("\t")
        if len(result) != 5:
            raise RuntimeError('expected 5 columns, got {} in row {}'.format(len(result), i))
        example_temp = {
            'sent_id1': int(result[1]),
            'sent_id2': int(result[2]),
            'label': int(result[0]),
            'id': examples_id,
        }
        example_dicts.append(example_temp)
        examples_id += 1
    file_tool.save_data_pickle(example_dicts, des_filename)
    return example_dicts
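# Sketch of the mapping above for one row, assuming the standard MRPC layout
# "Quality  #1 ID  #2 ID  #1 String  #2 String"; the sentence strings are kept
# in the separate sentence file, so only the ids and label survive here. The
# sample values are hypothetical.
_sample_result = ['1', '702876', '702977', 'sentence one text', 'sentence two text']
_sample_example = {
    'sent_id1': int(_sample_result[1]),  # 702876
    'sent_id2': int(_sample_result[2]),  # 702977
    'label': int(_sample_result[0]),     # 1 = paraphrase
    'id': 0,                             # running counter, not from the file
}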
def __extra_sentences_from_org_file__(self, org_file, des_filename):
    if file_tool.check_file(des_filename):
        sentence_dict = file_tool.load_data_pickle(des_filename)
        return sentence_dict
    sentence_dict = {}
    rows = file_tool.load_data(org_file, mode='r')
    for i, row in enumerate(rows):
        if i == 0:
            continue
        result = row.split("\t")
        if len(result) != 7:
            raise RuntimeError
        if not general_tool.is_number(result[0]):
            raise RuntimeError
        if str(result[0]) in sentence_dict:
            raise RuntimeError
        sentence_dict[str(result[0])] = str(result[1])
    file_tool.save_data_pickle(sentence_dict, des_filename)
    return sentence_dict