def DatasetExtendToSize(readCsvOnLocal=True, train_size=32, val_size=32, classify_count=10):
    '''
    :param readCsvOnLocal: True to run locally, False to run on the flyai server
    :param train_size: extend each class of the training set to this many samples
    :param val_size: extend each class of the validation set to this many samples
    :param classify_count: number of classes
    :return: a flyai Dataset instance
    '''
    # step 0 : read csv
    # flyai_source = readCsv_onFlyai(readCsvOnLocal)
    flyai_source = SourceByWangyi().source_csv

    # step 1 : csv to dataframe
    dataframe_train = pd.DataFrame(data=flyai_source.data)
    dataframe_test = pd.DataFrame(data=flyai_source.val)

    # step 2 : extend csv (dataframe)
    dataframe_train = ExtendCsvToSize(dataframe_train, size=train_size, classify_count=classify_count)
    dataframe_test = ExtendCsvToSize(dataframe_test, size=val_size, classify_count=classify_count)

    # step 3 : save csv
    dataframe_train.to_csv(os.path.join(DATA_PATH, 'wangyi-train.csv'), index=False)
    dataframe_test.to_csv(os.path.join(DATA_PATH, 'wangyi-test.csv'), index=False)

    # step 4 : load into a flyai.dataset.Dataset
    dataset_extend_newone = Dataset(source=readCustomCsv("wangyi-train.csv", "wangyi-test.csv"))
    return dataset_extend_newone

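# A minimal usage sketch of the helper above (this assumes SourceByWangyi,
# ExtendCsvToSize, readCustomCsv and DATA_PATH are defined/imported in this
# module, as in the function body):
if __name__ == '__main__':
    balanced_dataset = DatasetExtendToSize(readCsvOnLocal=True, train_size=64, val_size=32, classify_count=10)
    x_train, y_train = balanced_dataset.next_train_batch()
    print('train batch size:', len(x_train))
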
def predict_to_csv(self):
    save_file_name = 'upload-by-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.csv'
    save_file_name = os.path.join(os.curdir, 'data', 'output', save_file_name)
    # 1. open the output file
    with open(save_file_name, 'w', encoding='utf-8', newline='') as f:
        # 2. build a csv writer on top of the file object
        csv_writer = csv.writer(f)
        # 3. write the header row
        csv_writer.writerow(["image_path", "labels"])

        url_list = self.read_predict_Csv()
        dataset = Dataset(epochs=5, batch=16)
        model = Model(dataset)
        predict_list = []
        for row in url_list:
            predict_num = model.predict(image_path=row)
            # csv_writer.writerow([row, str(predict_num)])
            # predict_list.append(predict_num)
            predict_list.append([row, predict_num])

            # print a simple progress bar
            count_now = len(predict_list)
            count_total = len(url_list)
            print('\r' + 'prediction progress: ' + str(count_now) + '/' + str(count_total),
                  '----{:.2%}'.format(count_now / count_total), end='', flush=True)
        csv_writer.writerows(predict_list)
    print('\nCSV saved to ', save_file_name)

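# Self-contained sketch of the same write pattern (standard library only; the
# helper name write_predictions_csv is illustrative). newline='' is what keeps
# csv.writer from emitting blank lines between rows on Windows:
import csv
import os

def write_predictions_csv(rows, out_path):
    """rows: iterable of (image_path, label) pairs; writes a header plus all rows."""
    if os.path.dirname(out_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["image_path", "labels"])
        writer.writerows(rows)

# write_predictions_csv([("img/1.png", 3)], os.path.join("data", "output", "demo.csv"))
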
def getDatasetListByClassfy(classify_count=3):
    test_source = SourceByWangyi()
    xx, yy = test_source.get_sliceCSVbyClassify(classify_count=classify_count)
    list_tmp = []
    for epoch in range(classify_count):
        dataset = Dataset(source=readCustomCsv(xx[epoch], yy[epoch]))
        list_tmp.append(dataset)
    return list_tmp

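# Hedged usage sketch: iterate the per-class Dataset objects returned above and
# pull one training batch from each (assumes the same module context as the
# function, i.e. SourceByWangyi and readCustomCsv are available):
if __name__ == '__main__':
    per_class_datasets = getDatasetListByClassfy(classify_count=3)
    for class_id, class_dataset in enumerate(per_class_datasets):
        x_train, y_train = class_dataset.next_train_batch()
        print('class %d: got a batch of %d samples' % (class_id, len(x_train)))
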
def generate(self):
    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)
    source, target, _, _ = self.dataset.get_all_data()
    source = np.asarray([i['source'].split(' ') for i in source])
    target = np.asarray([i['target'].split(' ') for i in target])

    # shuffle the indices in place, then take the first 90% as train and the rest as dev
    index = np.arange(len(source))
    np.random.shuffle(index)
    train_source, dev_source = source[index[0:int(len(index) * 0.9)]], source[index[int(len(index) * 0.9):]]
    train_target, dev_target = target[index[0:int(len(index) * 0.9)]], target[index[int(len(index) * 0.9):]]

    return train_source, train_target, dev_source, dev_target

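# The 90/10 shuffled-index split above recurs in several generate() methods in
# this section; a self-contained sketch of the same idea (numpy only; the
# helper name train_dev_split is illustrative, not part of the project):
import numpy as np

def train_dev_split(x, y, dev_ratio=0.1, seed=None):
    """Shuffle x and y with the same permutation, then split off dev_ratio as a dev set."""
    rng = np.random.default_rng(seed)
    index = rng.permutation(len(x))
    cut = int(len(index) * (1 - dev_ratio))
    x, y = np.asarray(x)[index], np.asarray(y)[index]
    return x[:cut], y[:cut], x[cut:], y[cut:]

# train_x, train_y, dev_x, dev_y = train_dev_split(list(range(10)), list(range(10)), seed=0)
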
def __init__(self, exec_type='train'):
    # hyper-parameters for the project
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()

    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH)
    self.model_dir = os.path.join(os.getcwd(), arguments.model_dir)

    # 1. Split the data, read into defined format
    label2idx = dict((arguments.labels[i], i) for i in range(len(arguments.labels)))
    target_text, stance, _, _ = self.dataset.get_all_data()
    indexes = [" ".join(jieba.cut(i['TARGET'].lower(), cut_all=False)) for i in target_text]
    questions = [" ".join(jieba.cut(i['TEXT'].lower(), cut_all=False)) for i in target_text]
    labels = [i['STANCE'] for i in stance]
    data = [indexes, questions, labels]
    assert len(data[0]) == len(data[1]) == len(data[2])

    # 2. Data follows this order: train, test
    train_num = int(len(data[0]) * arguments.portion)
    train_data = [d[:train_num] for d in data]
    dev_data = [d[train_num:] for d in data]

    # 3. Read the vocab text file and get VOCAB dictionary
    vocab = read_emb(filename=os.path.join(os.getcwd(), arguments.sgns_dir), stat_lines=1)

    # 4. Transform text into indexes
    self.datasets, word2idx, embeddings = make_datasets(vocab=vocab,
                                                        raw_data={'training': train_data, 'validation': dev_data},
                                                        label2idx=label2idx,
                                                        big_voc=arguments.big_voc,
                                                        feat_names=arguments.feat_names)
    self.datasets_train = load_tvt(tvt_set=self.datasets['training'],
                                   max_lens=[arguments.ans_len, arguments.ask_len],
                                   feat_names=arguments.feat_names)
    self.datasets_dev = load_tvt(tvt_set=self.datasets['validation'],
                                 max_lens=[arguments.ans_len, arguments.ask_len],
                                 feat_names=arguments.feat_names)

    idx2word = dict((v, k) for k, v in word2idx.items())
    self.datasets["word2idx"] = word2idx
    self.datasets["idx2word"] = idx2word
    self.embeddings = torch.from_numpy(np.asarray(embeddings, dtype=np.float32))

    if exec_type == 'train':
        self.main()
    else:
        model = load_torch_model(self.model_dir)
        test(model=model, dataset=self.datasets, test_set=None)

def __init__(self, arguments):
    # hyper-parameters for the project
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()
    self.arguments = arguments

    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)

def generate(self):
    self.data = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)
    audio_paths, labels, _, _ = self.data.get_all_data()

    # wav file paths
    audio_paths = [i['audio_path'] for i in audio_paths]
    # wav transcripts  TODO: the labels contain spaces; test whether removing them improves the model
    audio_labels = []
    # pinyin transcriptions of the wav text
    audio_pinyins = []
    for label in labels:
        label = label['label'].split(' ')
        audio_labels.append(''.join(label))
        audio_pinyins.append(' '.join([
            ' '.join([' '.join(j) for j in pinyin(i, style=Style.TONE3, heteronym=False)])
            for i in label
        ]))

    # build the vocabulary
    for label in labels:
        self.sortedDict.append_tokens(label)
    self.sortedDict.dump_pkl()

    # split into train / dev
    audio_paths = np.asarray(audio_paths)
    audio_labels = np.asarray(audio_labels)
    audio_pinyins = np.asarray(audio_pinyins)
    index = np.arange(len(audio_paths))
    np.random.shuffle(index)
    train_audio_paths, dev_audio_paths = audio_paths[index[0:int(len(index) * 0.9)]], audio_paths[index[int(len(index) * 0.9):]]
    train_labels, dev_labels = audio_labels[index[0:int(len(index) * 0.9)]], audio_labels[index[int(len(index) * 0.9):]]
    train_pinyins, dev_pinyins = audio_pinyins[index[0:int(len(index) * 0.9)]], audio_pinyins[index[int(len(index) * 0.9):]]

    return (train_audio_paths.tolist(), train_labels.tolist(), train_pinyins.tolist(),
            dev_audio_paths.tolist(), dev_labels.tolist(), dev_pinyins.tolist())

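# Small self-contained sketch of the pypinyin call used above: each character is
# converted to TONE3-style pinyin (the tone number is appended), heteronyms off.
from pypinyin import pinyin, Style

if __name__ == '__main__':
    for char in '你好':
        print(char, pinyin(char, style=Style.TONE3, heteronym=False))
    # prints e.g.  你 [['ni3']]  and  好 [['hao3']]
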
def __init__(self, arguments):
    # hyper-parameters for the project
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()
    self.arguments = arguments

    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)

    target_text, stance, _, _ = self.dataset.get_all_data()
    target = np.asarray([i['TARGET'].lower() for i in target_text])
    text = np.asarray([i['TEXT'].lower() for i in target_text])
    stance = np.asarray([i['STANCE'] for i in stance])

    self.target_set = set()
    for tar in target:
        self.target_set.add(tar)

    text = PreProcessing(text).get_file_text()
    trainset = ABSADataset(data_type=None, fname=(target, text, stance), tokenizer=self.tokenizer)
    valset_len = int(len(trainset) * self.arguments.valset_ratio)
    self.trainset, self.valset = random_split(trainset, (len(trainset) - valset_len, valset_len))

def generate(self):
    self.dataset = Dataset(epochs=self.arguments.EPOCHS, batch=self.arguments.BATCH, val_batch=self.arguments.BATCH)
    news, category, _, _ = self.dataset.get_all_data()
    news = np.asarray([i['news'] for i in news])
    category = np.asarray([i['category'] for i in category])

    # shuffle the indices in place, then split 90% / 10% into train / dev
    index = np.arange(len(news))
    np.random.shuffle(index)
    train_news, dev_news = news[index[0:int(len(index) * 0.9)]], news[index[int(len(index) * 0.9):]]
    train_category, dev_category = category[index[0:int(len(index) * 0.9)]], category[index[int(len(index) * 0.9):]]

    return train_news, train_category, dev_news, dev_category

def __init__(self, exec_type="train"):
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=10, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=24, type=int, help="batch size")
    args = parser.parse_args()

    self.batch_size = args.BATCH
    self.epochs = args.EPOCHS
    self.learning_rate = arguments.learning_rate
    self.embedding_size = arguments.embedding_size
    self.hidden_size = arguments.hidden_size
    self.tags = arguments.tags
    self.dropout = arguments.dropout
    self.tag_map = {label: i for i, label in enumerate(arguments.labels)}

    if exec_type == "train":
        self.model = Net(
            tag_map=self.tag_map,
            batch_size=self.batch_size,
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
    else:
        self.model = None

    self.dataset = Dataset(epochs=self.epochs, batch=self.batch_size)

def __init__(self, args):
    self.args = args
    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

        x_data = self.data.predict_data(**data)
        predict = self.model_predict(x_data)
        predict = self.data.to_categorys(predict)
        return predict

    def predict_all(self, datas):
        predicts = []
        y_pred_cls = self.outputParams['y_pred_cls']
        input_x = self.inputParams['input_x']
        keep_prob = self.inputParams['keep_prob']
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, os.path.join(MODEL_PATH, TENSORFLOW_MODEL_DIR))
            for data in datas:
                x_data = self.data.predict_data(**data)
                predict = sess.run(y_pred_cls, feed_dict={input_x: x_data, keep_prob: 1.0})
                predict = self.data.to_categorys(predict)
                predicts.append(predict)
        return predicts


if __name__ == '__main__':
    # inputParams, outputParams, summaryParams = create_model(Processor().getWordsCount())
    # train_model(inputParams, outputParams, summaryParams, needInit=False)
    dataset = Dataset(train_batch=128, val_batch=64, split_ratio=0.9)
    model = Model(dataset)
    predic = model.predict(text="您好!我们这边是施华洛世奇鄞州万达店!您是我们尊贵的会员,特意邀请您参加我们x.x-x.x的三八女人节活动!满xxxx元享晶璨花漾丝巾")
    print(predic)

import os
from flyai.dataset import Dataset
from model import Model

data = Dataset(epochs=10, batch=32)
model = Model(data)
x_val, y_val = data.next_validation_batch()
# val = {y_val: x_val}
# x_train, y_train, x_test, y_test = data.next_batch(32)
# feed_dict = {x: x_train}
print(x_val.shape)
# print(y_val)
print(x_val[0].shape)
val = {x_val[0]: '0'}
p = model.predict_all(val)

        return labels

    def batch_iter(self, x, y, batch_size=128):
        """Yield shuffled mini-batches of (x, y)."""
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1
        indices = numpy.random.permutation(numpy.arange(data_len))
        x_shuffle = x[indices]
        y_shuffle = y[indices]
        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

    def save_model(self, network, path, name=Torch_MODEL_NAME, overwrite=False):
        super().save_model(network, path, name, overwrite)
        # torch.save(network, os.path.join(path, name))
        torch.save(network.state_dict(), os.path.join(path, name))


if __name__ == '__main__':
    from flyai.dataset import Dataset

    dataset = Dataset(epochs=1, batch=2)
    m = Model(dataset)
    print(m.predict_all([{'image_path': "img/381.tif"}]))

def get_dataset(cls):
    if cls.dataset is not None:
        return cls.dataset
    else:
        cls.dataset = Dataset(epochs=3, batch=3, val_batch=3)
        return cls.dataset

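# Self-contained sketch of the same lazy, class-level cache as get_dataset()
# above (DatasetHolder is a hypothetical wrapper name; the real owner class is
# whichever class get_dataset is defined on):
from flyai.dataset import Dataset

class DatasetHolder(object):
    dataset = None

    @classmethod
    def get_dataset(cls):
        if cls.dataset is None:
            cls.dataset = Dataset(epochs=3, batch=3, val_batch=3)
        return cls.dataset

# Repeated calls share one Dataset instance:
# assert DatasetHolder.get_dataset() is DatasetHolder.get_dataset()
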
def eval_one_batch(preds):
    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')

    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # run the model to get predictions, formatted as:
    # [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)

    # load the labels: [{'boxes': [], 'labels': [], 'image_id': []}, ...]
    targets = []
    for i in range(len(y_test)):
        label_path = y_test[i]['label_path']  # e.g. label/019646.jpg.txt
        boxes = []
        labels = []
        image_id = []
        image_id.append(x_test[i]['image_path'])
        with open(os.path.join(DATA_PATH, label_path)) as f:
            for line in f.readlines():
                # 1954.7443195924375,695.1497671989313,1984.659514688955,738.4779589540301,1933
                temp = line.strip().split(',')
                xmin = int(float(temp[0]))
                ymin = int(float(temp[1]))
                xmax = int(float(temp[2]))
                ymax = int(float(temp[3]))
                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(int(temp[4]))
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        targets.append(target)

    '''
    This if-branch does not need to be modified
    '''
    if len(y_test) != len(x_test):
        result = dict()
        result['score'] = 0
        result['label'] = "评估违规"
        result['info'] = ""
        print(json.dumps(result))
    else:
        '''
        Implement the evaluation metric below
        '''
        # compute the final score
        sum_ap = 0
        all_labels = [i for i in range(2)]  # all object classes
        # the per-class loop below was added by me
        for label in all_labels:
            # compute AP class by class
            prediction1 = []
            # when computing AP, filter the predictions by their predicted class
            for pred in preds:
                if pred[3] == label:
                    prediction1.append([pred[0], pred[1], pred[2][0], pred[2][1], pred[2][2], pred[2][3]])
            if len(prediction1) != 0:
                # only compute AP when this class has predicted boxes
                rec, prec, ap = voc_eval(targets, prediction1, label)
            else:
                ap = 0
            sum_ap += ap
        map = sum_ap / len(all_labels)
        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
    return map

'''
Run the model for prediction
'''
from flyai.dataset import Dataset
from model import Model

data = Dataset()
model = Model(data)
p = model.predict(
    text='gute lage im stadtzentrum. shoppingmeile und sehensw rdigkeiten, sowie gute pubs in laufweite. das hotel ist neu, gut gepflegt und hat bem htes nettes personal. ideal f r einen kurztrip nach edinburgh. l ngere aufenthalte eher nicht, da die zimmer recht klein sind.')
print(p)

Frequently asked questions: https://www.flyai.com/question
Feedback and suggestions earn a red packet! Add the customer-service WeChat: flyaixzs
'''

'''
Hyper-parameters for the project
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=1, type=int, help="batch size")
args = parser.parse_args()

'''
Data helper provided by the flyai library.
Pass in how many epochs to train over the whole dataset and the batch size.
'''
print('batch_size: %d, epoch_size: %d' % (args.BATCH, args.EPOCHS))
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH, val_batch=32)
model = Model(dataset)
print("number of train examples:%d" % dataset.get_train_length())
print("number of validation examples:%d" % dataset.get_validation_length())

# region hyper-parameters
n_classes = 45
fc1_dim = 512
# endregion

# region define input placeholders
x_inputs = tf.placeholder(shape=(None, 224, 224, 3), dtype=tf.float32, name='x_inputs')
y_inputs = tf.placeholder(shape=(None, n_classes), dtype=tf.float32,

# -*- coding: utf-8 -*-
from flyai.dataset import Dataset
from model import Model
# from dataset import Dataset
from processor import Processor

# dataset = Dataset(train_batch=128, val_batch=64, split_ratio=0.9)
dataset = Dataset(batch=32, val_batch=32)
model = Model(dataset)
# predict = model.predict(text="您好!我们这边是施华洛世奇鄞州万达店!您是我们尊贵的会员,特意邀请您参加我们x.x-x.x的三八女人节活动!满xxxx元享晶璨花漾丝巾")
# print(predict)

labels = [
    0, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

def __init__(self, arguments):
    # hyper-parameters for the project
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=4, type=int, help="batch size")
    self.args = parser.parse_args()
    self.arguments = arguments

    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)

    target_text, stance, _, _ = self.dataset.get_all_data()
    target = np.asarray([i['TARGET'].lower() for i in target_text])
    text = np.asarray([i['TEXT'].lower() for i in target_text])
    stance = np.asarray([i['STANCE'] for i in stance])

    # ############################# feature-lexicon approach, did not work well
    # train_data = pd.DataFrame(data=[stance, target, text]).T
    # train_data.columns = ['STANCE', 'TARGET', 'TEXT']
    # Util.calculate_word_count(train_data)
    # ############################# feature-lexicon approach, did not work well

    self.target_set = set()
    for tar in target:
        self.target_set.add(tar)

    text = PreProcessing(text).get_file_text()

    # ############################# synonym-replacement augmentation, did not work well
    # self.synonyms = SynonymsReplacer()
    # text_add = []
    # for index in range(len(text)):
    #     text_add.append(self.synonyms.get_syno_sents_list(text[index]))
    # target = np.append(target, target)
    # text = np.append(text, np.asarray(text_add))
    # stance = np.append(stance, stance)
    # ############################# synonym-replacement augmentation, did not work well

    print('target.shape: {}, text.shape: {}, stance.shape: {}'.format(target.shape, text.shape, stance.shape))

    trainset = ABSADataset(data_type=None, fname=(target, text, stance), tokenizer=self.tokenizer)
    valset_len = int(len(trainset) * self.arguments.valset_ratio)
    self.trainset, self.valset = random_split(trainset, (len(trainset) - valset_len, valset_len))

# author=yphacker

import argparse
import numpy as np
import tensorflow as tf
from flyai.dataset import Dataset

import config
from model import Model
from cnn_model import CNN

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=8, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
args = parser.parse_args()

dataset = Dataset(batch=args.BATCH, epochs=args.EPOCHS)
modelpp = Model(dataset)
model = CNN()

# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     train_writer = tf.summary.FileWriter(LOG_PATH, sess.graph)
#
#     # dataset.get_step() returns the total number of training steps
#     for step in range(dataset.get_step()):
#         x_train, y_train = dataset.next_train_batch()
#         x_val, y_val = dataset.next_validation_batch()
#
#         fetches = [loss, accuracy, train_op]
#         feed_dict = {input_x: x_train, input_y: y_train, keep_prob: 0.5}
#         loss_, accuracy_, _ = sess.run(fetches, feed_dict=feed_dict)

        return labels

    '''
    Save the trained model
    '''
    def save_model(self, model, path, name=KERAS_MODEL_NAME, overwrite=False):
        super().save_model(model, path, name, overwrite)
        model.save(os.path.join(path, name))


if __name__ == '__main__':
    print('ojbk')
    dataset = Dataset(epochs=5, batch=16)
    model = Model(dataset)
    p = model.predict_all([])
    print(p)
    x, y = dataset.next_train_batch()
    a = {
        'images/00007635_001.png': 0,
        'images/00002573_000.png': 0,
        'images/00000368_005.png': 0,
    }
    a1 = {
        'images/00007635_001.png': 0
    }
    aa = [

def __init__(self, path=MODEL_PATH, name='final.h5'):
    self.data = Dataset()
    from keras.utils import CustomObjectScope
    with CustomObjectScope({'AttLayer': AttLayer}):
        self.model = load_model(os.path.join(path, name))

# -*- coding: utf-8 -*-
'''
Run the model for prediction
'''
from flyai.dataset import Dataset
from model import Model

data = Dataset()
model = Model(data)

dataset = Dataset()
x_test, y_test = dataset.evaluate_data_no_processor('dev.csv')
preds = model.predict_all(x_test)

def __init__(self):
    self.args = args
    self.dataset = Dataset(epochs=self.args.total_epochs, batch=self.args.batch_size, val_batch=self.args.batch_size)

            else:
                ap = 0
            sum_ap += ap
        map = sum_ap / len(all_labels)
        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
    return map


if __name__ == "__main__":
    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')

    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # run the model to get predictions, formatted as:
    # [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)

    One row of input x from the csv; this method is called repeatedly by
    dataset.next_train_batch() and dataset.next_validation_batch(), and is also
    called to process data during evaluation.
    Its parameters correspond to the input: -> columns: fields in app.yaml.
    '''

    def output_x(self, TARGET, TEXT):
        text2vec = self.input_x(TARGET, TEXT)
        return text2vec

    '''
    Output result, consumed by dataset.to_categorys(data)
    '''
    def output_y(self, data):
        index = np.argmax(data)
        return index


if __name__ == '__main__':
    from flyai.dataset import Dataset

    dataset = Dataset(10, 32)
    train_x, train_y, val_x, val_y = dataset.get_all_data()
    preTrainedEmbedding = PreTrainedEmbedding()
    contents = [x['TEXT'] for x in train_x]
    unfounds = []
    for words in contents:
        print(words)
        vector, unfound = preTrainedEmbedding.turnToVectors(words)
        unfounds.append(unfound)
    print("unfound probability is: %f" % np.mean(unfounds))

For first-time use, please read the "第一次使用请读我.html" file in the project.
Frequently asked questions: https://www.flyai.com/question
Feedback and suggestions earn a red packet! Add the customer-service WeChat: flyaixzs
'''

'''
Hyper-parameters for the project
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=64, type=int, help="batch size")
args = parser.parse_args()

'''
Data helper provided by the flyai library.
Pass in how many epochs to train over the whole dataset and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
model = Model(dataset)

word_dict, word_dict_res = load_dict()
vocab_size = max(word_dict.values()) + 1

# hyper-parameters
embedding_dim = 64    # embedding layer size
dnn_dim = 128         # dense layer size
max_seq_len = 128     # maximum sequence length
num_filters = 64      # number of convolution filters
kernel_size = 5       # convolution kernel size
learning_rate = 1e-3  # learning rate
numclass = 2          # number of classes

# feed placeholders

def main():
    """
    Hyper-parameters for the project
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=8, type=int, help="batch size")
    args = parser.parse_args()

    # ------------------ pick CUDA or CPU ----------------------
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    device = torch.device(device)

    # ------------------ preprocess the data ----------------------
    dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
    network = Net.from_pretrained(arguments.bert_model, num_tag=len(arguments.labels)).to(device)
    logger.info('\nPreprocessing finished!\n')

    # --------------------- optimizer -------------------------
    param_optimizer = list(network.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    t_total = int(dataset.get_train_length() / arguments.gradient_accumulation_steps / args.BATCH * args.EPOCHS)

    # --------------------- half precision (fp16) on GPU -----------------------------
    if arguments.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=arguments.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if arguments.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=arguments.loss_scale)
    # ------------------------ single precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=arguments.learning_rate,
                             warmup=arguments.warmup_proportion,
                             t_total=t_total)

    # --------------------- model initialisation ----------------------
    if arguments.fp16:
        network.half()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(args.EPOCHS):
        network.train()
        for step in range(dataset.get_step() // args.EPOCHS):
            x_train, y_train = dataset.next_train_batch()
            batch = create_batch_iter(mode='train', X=x_train, y=y_train).dataset.tensors
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = network(input_ids, segment_ids, input_mask)
            train_loss = network.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)

            if arguments.gradient_accumulation_steps > 1:
                train_loss = train_loss / arguments.gradient_accumulation_steps

            if arguments.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % arguments.gradient_accumulation_steps == 0:
                def warmup_linear(x, warmup=0.002):
                    if x < warmup:
                        return x / warmup
                    return 1.0 - x

                # modify learning rate with special warm up BERT uses
                lr_this_step = arguments.learning_rate * warmup_linear(global_step / t_total,
                                                                       arguments.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            predicts = network.predict(bert_encode, output_mask)
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()
            train_acc, f1 = network.acc_f1(predicts, label_ids)
            logger.info("\n train_acc: %f - train_loss: %f - f1: %f - using time: %f - step: %d \n" %
                        (train_acc, train_loss.item(), f1, (time.time() - start), step))

        # ----------------------- validation ----------------------------
        network.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step in range(dataset.get_step() // args.EPOCHS):
                x_val, y_val = dataset.next_validation_batch()
                batch = create_batch_iter(mode='dev', X=x_val, y=y_val).dataset.tensors
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = network(input_ids, segment_ids, input_mask).cpu()
                eval_los = network.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
                eval_loss = eval_los + eval_loss
                count += 1
                predicts = network.predict(bert_encode, output_mask)
                y_predicts.append(predicts)
                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)

            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()
            print('eval:')
            print(eval_predicted.numpy().tolist())
            print(eval_labeled.numpy().tolist())
            eval_acc, eval_f1 = network.acc_f1(eval_predicted, eval_labeled)
            network.class_report(eval_predicted, eval_labeled)

            logger.info('\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
                        % (e + 1, train_loss.item(), eval_loss.item() / count, train_acc, eval_acc, eval_f1))

            # keep the best model so far
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(network, arguments.output_dir)

            if e % 1 == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)