class Instructor(object):
    """
    Characteristics: uses flyai's get_all_data | flyai-provided next_batch.
    """

    def __init__(self, args):
        self.args = args
        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)

    def run(self):
        best_err1 = 100.
        best_epoch = 0

        logger.info('==> creating model "{}"'.format(self.args.model_name))
        model = Util.getModel(**vars(self.args))
        model = model.to(DEVICE)

        # In most cases this flag lets cuDNN's built-in auto-tuner search for the
        # most efficient algorithms for the current configuration, improving
        # runtime efficiency.
        cudnn.benchmark = True

        # Define the loss function (criterion) and optimizer.
        # criterion = nn.CrossEntropyLoss().to(DEVICE)
        # Label smoothing
        criterion = LabelSmoothingLoss(classes=self.args.num_classes, smoothing=0.1)
        # Focal Loss
        # criterion = FocalLoss(class_num=self.args.num_classes)

        # Define the optimizer.
        optimizer = Util.getOptimizer(model=model, args=self.args)

        trainer = Trainer_1(dataset=self.dataset, criterion=criterion,
                            optimizer=optimizer, args=self.args, logger=logger)
        logger.info('train: {} test: {}'.format(
            self.dataset.get_train_length(),
            self.dataset.get_validation_length()))

        for epoch in range(0, self.args.EPOCHS):
            # Train for one epoch.
            model = trainer.train(model=model, epoch=epoch)

            # Evaluate on the validation set.
            model, val_err1 = trainer.test(model=model, epoch=epoch)

            # Remember the best err@1 and save a checkpoint.
            is_best = val_err1 < best_err1
            if is_best:
                best_err1 = val_err1
                best_epoch = epoch
                logger.info('Best val_err1 {}'.format(best_err1))
            Util.save_checkpoint(model.state_dict(), is_best,
                                 self.args.output_models_dir)
            if not is_best and epoch - best_epoch >= self.args.patience > 0:
                break

        logger.info('Best val_err1: {:.4f} at epoch {}'.format(best_err1, best_epoch))
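# A minimal sketch of what LabelSmoothingLoss above might look like; the actual
# class lives elsewhere in the project, so this is an assumed implementation,
# shown only to clarify the smoothing=0.1 call: each target distribution puts
# 1 - smoothing on the gold class and spreads smoothing over the other classes.
import torch
import torch.nn as nn


class LabelSmoothingLossSketch(nn.Module):
    def __init__(self, classes, smoothing=0.1, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.classes = classes
        self.dim = dim

    def forward(self, pred, target):
        # pred: raw logits of shape (batch, classes); target: class indices.
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Smoothed target distribution.
            true_dist = torch.full_like(pred, self.smoothing / (self.classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))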
def generate(self):
    self.dataset = Dataset(epochs=self.args.EPOCHS,
                           batch=self.args.BATCH,
                           val_batch=self.args.BATCH)
    source, target, _, _ = self.dataset.get_all_data()
    source = np.asarray([i['source'].split(' ') for i in source])
    target = np.asarray([i['target'].split(' ') for i in target])

    # np.random.shuffle(np.asarray(index)) shuffled a throwaway copy and left
    # the index order unchanged; use a permutation so the split is actually random.
    index = np.random.permutation(len(source))
    split = int(len(index) * 0.9)
    train_source, dev_source = source[index[:split]], source[index[split:]]
    train_target, dev_target = target[index[:split]], target[index[split:]]

    return train_source, train_target, dev_source, dev_target
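# The 90/10 split above recurs in several generate() methods below. A small
# hypothetical helper like this (not part of the original project) keeps the
# permutation logic in one place and avoids the shuffled-copy pitfall entirely.
import numpy as np


def train_dev_split(arrays, train_ratio=0.9, seed=None):
    """Split parallel arrays into train/dev slices with one shared permutation."""
    rng = np.random.default_rng(seed)
    index = rng.permutation(len(arrays[0]))
    split = int(len(index) * train_ratio)
    train = [a[index[:split]] for a in arrays]
    dev = [a[index[split:]] for a in arrays]
    return train, dev


# Usage:
# (train_source, train_target), (dev_source, dev_target) = \
#     train_dev_split([source, target])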
def __init__(self, exec_type='train'):
    # Project hyperparameters.
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()

    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH)
    self.model_dir = os.path.join(os.getcwd(), arguments.model_dir)

    # 1. Split the data and read it into the defined format.
    label2idx = dict((arguments.labels[i], i) for i in range(len(arguments.labels)))
    target_text, stance, _, _ = self.dataset.get_all_data()
    indexes = [" ".join(jieba.cut(i['TARGET'].lower(), cut_all=False)) for i in target_text]
    questions = [" ".join(jieba.cut(i['TEXT'].lower(), cut_all=False)) for i in target_text]
    labels = [i['STANCE'] for i in stance]
    data = [indexes, questions, labels]
    assert len(data[0]) == len(data[1]) == len(data[2])

    # 2. The data follows this order: train, test.
    train_num = int(len(data[0]) * arguments.portion)
    train_data = [d[:train_num] for d in data]
    dev_data = [d[train_num:] for d in data]

    # 3. Read the vocab text file and build the VOCAB dictionary.
    vocab = read_emb(filename=os.path.join(os.getcwd(), arguments.sgns_dir), stat_lines=1)

    # 4. Transform text into indexes.
    self.datasets, word2idx, embeddings = make_datasets(
        vocab=vocab,
        raw_data={'training': train_data, 'validation': dev_data},
        label2idx=label2idx,
        big_voc=arguments.big_voc,
        feat_names=arguments.feat_names)
    self.datasets_train = load_tvt(tvt_set=self.datasets['training'],
                                   max_lens=[arguments.ans_len, arguments.ask_len],
                                   feat_names=arguments.feat_names)
    self.datasets_dev = load_tvt(tvt_set=self.datasets['validation'],
                                 max_lens=[arguments.ans_len, arguments.ask_len],
                                 feat_names=arguments.feat_names)

    idx2word = dict((v, k) for k, v in word2idx.items())
    self.datasets["word2idx"] = word2idx
    self.datasets["idx2word"] = idx2word
    self.embeddings = torch.from_numpy(np.asarray(embeddings, dtype=np.float32))

    if exec_type == 'train':
        self.main()
    else:
        model = load_torch_model(self.model_dir)
        test(model=model, dataset=self.datasets, test_set=None)
def DatasetExtendToSize(readCsvOnLocal=True, train_size=32, val_size=32, classify_count=10):
    '''
    :param readCsvOnLocal: True to run locally, False to run on the flyai server
    :param train_size: target size per class for the training set
    :param val_size: target size per class for the validation set
    :param classify_count: number of classes
    :return: a flyai Dataset instance
    '''
    # step 0: read csv
    # flyai_source = readCsv_onFlyai(readCsvOnLocal)
    flyai_source = SourceByWangyi().source_csv

    # step 1: csv to dataframe
    dataframe_train = pd.DataFrame(data=flyai_source.data)
    dataframe_test = pd.DataFrame(data=flyai_source.val)

    # step 2: extend csv (dataframe)
    dataframe_train = ExtendCsvToSize(dataframe_train, size=train_size, classify_count=classify_count)
    dataframe_test = ExtendCsvToSize(dataframe_test, size=val_size, classify_count=classify_count)

    # step 3: save csv
    dataframe_train.to_csv(os.path.join(DATA_PATH, 'wangyi-train.csv'), index=False)
    dataframe_test.to_csv(os.path.join(DATA_PATH, 'wangyi-test.csv'), index=False)

    # step 4: load into flyai.dataset
    dataset_extend_newone = Dataset(source=readCustomCsv("wangyi-train.csv", "wangyi-test.csv"))
    return dataset_extend_newone
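# ExtendCsvToSize is defined elsewhere in the project; conceptually it
# oversamples each class to a fixed size. A hypothetical pandas sketch of that
# idea (the label column name 'label' is an assumption, not the project's):
import pandas as pd


def extend_to_size_sketch(df, size, label_col='label'):
    """Oversample every under-sized class to `size` rows, sampling with replacement."""
    parts = [group.sample(n=size, replace=True) if len(group) < size else group
             for _, group in df.groupby(label_col)]
    return pd.concat(parts, ignore_index=True)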
def predict_to_csv(self):
    save_file_name = 'upload-by-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.csv'
    save_file_name = os.path.join(os.curdir, 'data', 'output', save_file_name)

    # 1. Create the file object.
    with open(save_file_name, 'w', encoding='utf-8', newline='') as f:
        # 2. Build a csv writer on top of the file object.
        csv_writer = csv.writer(f)
        # 3. Write the header row.
        csv_writer.writerow(["image_path", "labels"])

        url_list = self.read_predict_Csv()
        dataset = Dataset(epochs=5, batch=16)
        model = Model(dataset)
        predict_list = []
        for row in url_list:
            predict_num = model.predict(image_path=row)
            # csv_writer.writerow([row, str(predict_num)])
            # predict_list.append(predict_num)
            predict_list.append([row, predict_num])
            # Print a progress bar.
            count_now = len(predict_list)
            count_total = len(url_list)
            print('\r' + 'prediction progress: ' + str(count_now) + '/' + str(count_total),
                  '----{:.2%}'.format(count_now / count_total), end='', flush=True)
        csv_writer.writerows(predict_list)
    print('\nSaved CSV to ', save_file_name)
def generate(self):
    self.data = Dataset(epochs=self.args.EPOCHS,
                        batch=self.args.BATCH,
                        val_batch=self.args.BATCH)
    audio_paths, labels, _, _ = self.data.get_all_data()

    # Paths of the wav files.
    audio_paths = [i['audio_path'] for i in audio_paths]
    # Transcripts of the wav files. TODO: these contain spaces; test whether
    # removing the spaces improves model performance.
    audio_labels = []
    # Pinyin of the transcripts.
    audio_pinyins = []
    for label in labels:
        label = label['label'].split(' ')
        audio_labels.append(''.join(label))
        audio_pinyins.append(' '.join([
            ' '.join([' '.join(j) for j in pinyin(i, style=Style.TONE3, heteronym=False)])
            for i in label
        ]))

    # Build the vocabulary.
    for label in labels:
        self.sortedDict.append_tokens(label)
    self.sortedDict.dump_pkl()

    # Train/dev split. np.random.shuffle(np.asarray(index)) shuffled a copy,
    # so the original split was not actually random; use a permutation instead.
    audio_paths = np.asarray(audio_paths)
    audio_labels = np.asarray(audio_labels)
    audio_pinyins = np.asarray(audio_pinyins)
    index = np.random.permutation(len(audio_paths))
    split = int(len(index) * 0.9)
    train_audio_paths, dev_audio_paths = audio_paths[index[:split]], audio_paths[index[split:]]
    train_labels, dev_labels = audio_labels[index[:split]], audio_labels[index[split:]]
    train_pinyins, dev_pinyins = audio_pinyins[index[:split]], audio_pinyins[index[split:]]

    return (train_audio_paths.tolist(), train_labels.tolist(), train_pinyins.tolist(),
            dev_audio_paths.tolist(), dev_labels.tolist(), dev_pinyins.tolist())
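# For reference, a quick sketch of what the pinyin call above produces, assuming
# the pypinyin package: Style.TONE3 appends the tone number to each syllable, and
# heteronym=False keeps only one reading per character.
from pypinyin import Style, pinyin

print(pinyin('中国', style=Style.TONE3, heteronym=False))
# -> [['zhong1'], ['guo2']]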
def __init__(self, arguments):
    # Project hyperparameters.
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()

    self.arguments = arguments
    self.dataset = Dataset(epochs=self.args.EPOCHS,
                           batch=self.args.BATCH,
                           val_batch=self.args.BATCH)

    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(
            pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset)
        )
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(self.arguments.embed_dim), self.arguments.dataset)
        )
        self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)

    target_text, stance, _, _ = self.dataset.get_all_data()
    target = np.asarray([i['TARGET'].lower() for i in target_text])
    text = np.asarray([i['TEXT'].lower() for i in target_text])
    stance = np.asarray([i['STANCE'] for i in stance])

    self.target_set = set()
    for tar in target:
        self.target_set.add(tar)

    text = PreProcessing(text).get_file_text()
    trainset = ABSADataset(data_type=None, fname=(target, text, stance), tokenizer=self.tokenizer)
    valset_len = int(len(trainset) * self.arguments.valset_ratio)
    self.trainset, self.valset = random_split(trainset, (len(trainset) - valset_len, valset_len))
def getDatasetListByClassfy(classify_count=3):
    test_source = SourceByWangyi()
    xx, yy = test_source.get_sliceCSVbyClassify(classify_count=classify_count)
    list_tmp = []
    for i in range(classify_count):
        dataset = Dataset(source=readCustomCsv(xx[i], yy[i]))
        list_tmp.append(dataset)
    return list_tmp
class Predictor(object):
    def __init__(self, path=MODEL_PATH, name='final.h5'):
        self.data = Dataset()
        from keras.utils import CustomObjectScope
        with CustomObjectScope({'AttLayer': AttLayer}):
            self.model = load_model(os.path.join(path, name))

    def predict(self, **data):
        p = self.model.predict(self.data.predict_data(**data))
        return p

    def to_category(self, p):
        y = self.data.to_categorys(p)
        return y

    def predict_to_category(self, **data):
        p = self.predict(**data)
        y = self.to_category(p)
        return y
def __init__(self, exec_type="train"): parser = argparse.ArgumentParser() parser.add_argument("-e", "--EPOCHS", default=10, type=int, help="train epochs") parser.add_argument("-b", "--BATCH", default=24, type=int, help="batch size") args = parser.parse_args() self.batch_size = args.BATCH self.epochs = args.EPOCHS self.learning_rate = arguments.learning_rate self.embedding_size = arguments.embedding_size self.hidden_size = arguments.hidden_size self.tags = arguments.tags self.dropout = arguments.dropout self.tag_map = {label: i for i, label in enumerate(arguments.labels)} if exec_type == "train": self.model = Net( tag_map=self.tag_map, batch_size=self.batch_size, dropout=self.dropout, embedding_dim=self.embedding_size, hidden_dim=self.hidden_size, ) else: self.model = None self.dataset = Dataset(epochs=self.epochs, batch=self.batch_size)
# -*- coding: utf-8 -*-
'''
Run the model for inference.
'''
from flyai.dataset import Dataset

from model import Model

dataset = Dataset()
model = Model(dataset)

x_test, y_test = dataset.evaluate_data_no_processor('dev.csv')
preds = model.predict_all(x_test)
parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs") parser.add_argument("-b", "--BATCH", default=16, type=int, help="batch size") args = parser.parse_args() ''' flyai库中的提供的数据处理方法 传入整个数据训练多少轮,每批次批大小 ''' dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH) print( f'train datas: {dataset.get_train_length()}, val data: {dataset.get_validation_length()}' ) lr = 1e-4 num_warmup_steps = 1000 max_grad = 1.0 ''' 实现自己的网络机构 ''' # 判断gpu是否可用 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') net = Net().to(device) model = Model(dataset, net) # print(net) # optimizer = torch.optim.Adam(net.parameters(), lr=5e-6)
import argparse

import tensorflow as tf
from flyai.dataset import Dataset

from model import Model  # data helper class
from path import MODEL_PATH

dataset = Dataset()
dataset.get_all_processor_data()
model = Model(dataset)

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=10, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
args = parser.parse_args()

'''
Implement your own algorithm with TensorFlow.
'''
# Define the input placeholders.
x = tf.placeholder(tf.float32, shape=[None, 200, 200, 3], name='input_x')
y = tf.placeholder(tf.int64, shape=[None], name='input_y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')


# Initialize weights.
def weight_variable(shape, name):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)
FAQ: https://www.flyai.com/question
'''
'''
Project hyperparameters.
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=1, type=int, help="batch size")
args = parser.parse_args()
'''
Data-handling helper provided by the flyai library.
Pass in how many epochs to train on the full data and the batch size.
'''
print('batch_size: %d, epoch_size: %d' % (args.BATCH, args.EPOCHS))
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH, val_batch=32)
model = Model(dataset)
print("number of train examples: %d" % dataset.get_train_length())
print("number of validation examples: %d" % dataset.get_validation_length())

# region hyperparameters
n_classes = 45
fc1_dim = 512
# endregion

# region input placeholders
x_inputs = tf.placeholder(shape=(None, 224, 224, 3), dtype=tf.float32, name='x_inputs')
y_inputs = tf.placeholder(shape=(None, n_classes), dtype=tf.float32,
'''
Project hyperparameters.
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
args = parser.parse_args()
'''
Data-handling helper provided by the flyai library.
Pass in how many epochs to train on the full data and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
model = Model(dataset)
'''
Build your own network structure.
'''
x = tf.placeholder(tf.float32, shape=[None, 28, 28, 1], name='input_x')
y = tf.placeholder(tf.float32, shape=[None, 10], name='input_y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
learning_rate = 0.001
'''
dataset.get_step() returns the total number of iteration steps.
'''
# Parameter summary
        else:
            ap = 0
        sum_ap += ap

    map = sum_ap / len(all_labels)
    result = dict()
    result['score'] = round(map * 100, 2)
    result['label'] = "The Score is MAP."
    result['info'] = ""
    print(json.dumps(result))
    return map


if __name__ == "__main__":
    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')

    # Shuffle x_test and y_test in the same order by reusing the seed.
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # Get predictions from the model, in the format
    # [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)
class Instructor(object):
    """
    Characteristics: uses flyai's get_all_data | splits next_batch itself.
    """

    def __init__(self, arguments):
        self.arguments = arguments

    def train(self, train_category, dev_category, train_news, dev_news,
              tokenizer, Net=None, model=None):
        if os.path.exists(self.arguments.output_config_file) is True:
            os.remove(self.arguments.output_config_file)

        logger.info('>>train.shape: {} | dev.shape: {}'.format(
            train_category.shape, dev_category.shape))

        train_dataloader, train_examples_len = Util.load_data(
            news=train_news, category=train_category, data_type='train',
            label_list=self.arguments.label_list,
            max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer, batch_size=self.arguments.BATCH)
        dev_dataloader, dev_examples_len = Util.load_data(
            news=dev_news, category=dev_category, data_type='dev',
            label_list=self.arguments.label_list,
            max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer, batch_size=self.arguments.BATCH)

        num_train_optimization_steps = int(
            train_examples_len / self.arguments.BATCH /
            self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS

        # Model preparation.
        logger.info("model name is {}".format(self.arguments.model_name))
        if model is None:
            if self.arguments.model_name == "BertOrigin":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == 'BertHAN':
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == "BertCNN":
                filter_sizes = [int(val) for val in self.arguments.filter_sizes.split()]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == "BertATT":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == "BertRCNN":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    rnn_hidden_size=self.arguments.rnn_hidden_size,
                    num_layers=self.arguments.num_layers,
                    bidirectional=self.arguments.bidirectional,
                    dropout=self.arguments.dropout)
            elif self.arguments.model_name == "BertCNNPlus":
                filter_sizes = [int(val) for val in self.arguments.filter_sizes.split()]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes)
        model.to(DEVICE)

        """ Optimizer preparation """
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        # To reproduce BertAdam-specific behavior set correct_bias=False.
        optimizer = AdamW(params=optimizer_grouped_parameters,
                          lr=self.arguments.learning_rate,
                          correct_bias=False)
        # PyTorch scheduler
        scheduler = WarmupLinearSchedule(
            optimizer=optimizer,
            warmup_steps=self.arguments.warmup_proportion,
            t_total=num_train_optimization_steps)

        """ Loss function preparation """
        if self.arguments.use_label_smoothing:
            criterion = NMTCriterion(label_smoothing=self.arguments.label_smoothing)
        else:
            criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(DEVICE)

        best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
        for epoch in range(int(self.arguments.EPOCHS)):
            if early_stop_times >= self.arguments.early_stop * (
                    train_examples_len // self.arguments.BATCH):
                break

            logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                if self.arguments.label_smoothing:
                    criterion.train()

                batch = tuple(t.to(DEVICE) for t in batch)
                _, input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                loss = criterion(inputs=logits, labels=label_ids,
                                 normalization=1.0, reduce=False)

                # Scale the loss for gradient accumulation.
                if self.arguments.gradient_accumulation_steps > 1:
                    loss = loss / self.arguments.gradient_accumulation_steps
                loss.backward(torch.ones_like(loss))
                scheduler.step()

                if (step + 1) % self.arguments.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % self.arguments.print_step == 0 and global_step != 0:
                    dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                        model, dev_dataloader, criterion, DEVICE,
                        self.arguments.label_list, args=self.arguments)
                    logger.info('\n>>>dev report: \n{}'.format(dev_report))

                    # Track the best accuracy.
                    if dev_acc > best_acc:
                        best_acc = dev_acc

                    # Track the best AUC and checkpoint on it.
                    if dev_auc > best_auc:
                        best_auc = dev_auc
                        # Save the model.
                        model_to_save = model.module if hasattr(model, 'module') else model
                        torch.save(model_to_save.state_dict(),
                                   self.arguments.output_model_file)
                        with open(self.arguments.output_config_file, 'w') as f:
                            f.write(model_to_save.config.to_json_string())
                        early_stop_times = 0
                    else:
                        early_stop_times += 1

        if os.path.exists(self.arguments.output_config_file) is False:
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), self.arguments.output_model_file)
            with open(self.arguments.output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

    def generate(self):
        self.dataset = Dataset(epochs=self.arguments.EPOCHS,
                               batch=self.arguments.BATCH,
                               val_batch=self.arguments.BATCH)
        news, category, _, _ = self.dataset.get_all_data()
        news = np.asarray([i['news'] for i in news])
        category = np.asarray([i['category'] for i in category])

        # np.random.shuffle(np.asarray(index)) shuffled a copy, leaving the
        # split deterministic; use a permutation instead.
        index = np.random.permutation(len(news))
        split = int(len(index) * 0.9)
        train_news, dev_news = news[index[:split]], news[index[split:]]
        train_category, dev_category = category[index[:split]], category[index[split:]]

        return train_news, train_category, dev_news, dev_category

    def run(self):
        remote_helper.get_remote_date("https://www.flyai.com/m/chinese_base.zip")
        before_vocab_dir = os.path.join(os.getcwd(), 'vocab.txt')
        after_vocab_dir = os.path.join(args.bert_model_dir, 'vocab.txt')
        logger.info('>before_vocab_dir:{}'.format(before_vocab_dir))
        logger.info('>after_vocab_dir:{}'.format(after_vocab_dir))
        shutil.copyfile(before_vocab_dir, after_vocab_dir)

        if not os.path.exists(self.arguments.output_dir):
            os.mkdir(self.arguments.output_dir)

        self.arguments.BATCH = self.arguments.BATCH // self.arguments.gradient_accumulation_steps

        # Data preparation: choose the tokenizer.
        tokenizer = BertTokenizer(self.arguments.bert_vocab_file).from_pretrained(
            self.arguments.bert_model_dir,
            do_lower_case=self.arguments.do_lower_case)

        # Fetch the data: news/categories.
        train_news, train_category, dev_news, dev_category = self.generate()
        self.train(Net=Net,
                   train_category=train_category,
                   dev_category=dev_category,
                   train_news=train_news,
                   dev_news=dev_news,
                   tokenizer=tokenizer)
from flyai.dataset import Dataset

from model import Model

data = Dataset()
model = Model(data)
# p = model.predict(age=53, sex=1, cp=3, trestbps=130, chol=246, fbs=1, restecg=2,
#                   thalach=173, exang=0, oldpeak=0.0, slope=1, ca=3, thal=3)
# print(p)

tData = data.get_all_data()
preds = model.predict_all(tData[0])

y_test = []
for label in tData[1]:
    y_test.append(label['label'])

rCount = 0.0
for i in range(0, len(preds)):
    if preds[i] == y_test[i]:
        rCount += 1.
test_acc = rCount / len(preds)
print('accuracy %g' % test_acc)
                    type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=64, type=int, help="batch size")
parser.add_argument("-vb", "--VAL_BATCH", default=64, type=int, help="val batch size")
args = parser.parse_args()
# In this example, args.BATCH and args.VAL_BATCH must be equal.
'''
Data-handling helper provided by the flyai library.
Pass in how many epochs to train on the full data and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH, val_batch=args.VAL_BATCH)
model = Model(dataset)

# Hyperparameters
que_dict, ans_dict = load_dict()
encoder_vocab_size = len(que_dict)
decoder_vocab_size = len(ans_dict)
# Batch size
batch_size = args.BATCH
# RNN size
rnn_size = 64
# Number of layers
num_layers = 3
# Embedding size
encoding_embedding_size = 64
# -*- coding: utf-8 -*-
from flyai.dataset import Dataset

from model import Model

print('predict invoked')
data = Dataset()
model = Model(data)
p = model.predict_all(data.get_all_data()[0])
print(p)
def __init__(self, args):
    self.args = args
    self.dataset = Dataset(epochs=self.args.EPOCHS,
                           batch=self.args.BATCH,
                           val_batch=self.args.BATCH)
'''
Run the model for inference.
'''
from flyai.dataset import Dataset

from model import Model

data = Dataset()
model = Model(data)
p = model.predict(
    text='gute lage im stadtzentrum. shoppingmeile und sehenswürdigkeiten, sowie gute pubs '
         'in laufweite. das hotel ist neu, gut gepflegt und hat bemühtes nettes personal. '
         'ideal für einen kurztrip nach edinburgh. längere aufenthalte eher nicht, da die '
         'zimmer recht klein sind.')
print(p)
# author=yphacker

import argparse

import numpy as np
import tensorflow as tf
from flyai.dataset import Dataset

import config
from model import Model
from bert_model import BertModel

parser = argparse.ArgumentParser()
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
parser.add_argument("-e", "--EPOCHS", default=8, type=int, help="train epochs")
args = parser.parse_args()

dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
modelpp = Model(dataset)
model = BertModel()


def learning_rate_decay(learning_rate):
    return learning_rate * 0.5


def evaluate(sess):
    """Evaluate accuracy and loss on the validation data."""
    x_val_all, y_val_all = dataset.get_all_validation_data()
    data_len = len(y_val_all)
    index = np.random.permutation(len(y_val_all))
    n_batches = len(y_val_all) // args.BATCH + 1
    total_loss = 0.0
import argparse

import numpy as np
import tensorflow as tf
from flyai.dataset import Dataset
from tensorflow.contrib.rnn import DropoutWrapper

import config
from model import Model
from path import MODEL_PATH, LOG_PATH
from utils import load_word2vec_embedding

# Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=30, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=128, type=int, help="batch size")
args = parser.parse_args()

# Data helper class
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
# Model helper class
modelpp = Model(dataset)

'''
Implement your own algorithm with TensorFlow.
'''
# Training and test data settings.
unit_num = config.embeddings_size  # By default the word-vector size equals the number of
                                   # units in the RNN (per time step) and CNN (per column);
                                   # to avoid confusion, the model uses unit_num throughout.
time_step = config.max_sequence    # The maximum sentence length equals time_step;
                                   # to avoid confusion, the model uses time_step throughout.
DROPOUT_RATE = config.dropout
LEARN_RATE = config.leanrate
TAGS_NUM = config.label_len
def eval_one_batch(preds):
    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')

    # Shuffle x_test and y_test in the same order by reusing the seed.
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # Get predictions from the model, in the format
    # [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)

    # Load the labels: [{'boxes': [], 'labels': [], 'image_id': []}, ...]
    targets = []
    for i in range(len(y_test)):
        label_path = y_test[i]['label_path']  # e.g. label/019646.jpg.txt
        boxes = []
        labels = []
        image_id = []
        image_id.append(x_test[i]['image_path'])
        with open(os.path.join(DATA_PATH, label_path)) as f:
            for line in f.readlines():
                # e.g. 1954.7443195924375,695.1497671989313,1984.659514688955,738.4779589540301,1933
                temp = line.strip().split(',')
                xmin = int(float(temp[0]))
                ymin = int(float(temp[1]))
                xmax = int(float(temp[2]))
                ymax = int(float(temp[3]))
                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(int(temp[4]))
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        targets.append(target)

    '''
    No need to modify this `if`.
    '''
    if len(y_test) != len(x_test):
        result = dict()
        result['score'] = 0
        result['label'] = "Evaluation violation"
        result['info'] = ""
        print(json.dumps(result))
    else:
        '''
        Implement the evaluation algorithm below.
        '''
        # Compute the final score.
        sum_ap = 0
        all_labels = [i for i in range(2)]  # All target classes.
        # The block below was added by the author.
        for label in all_labels:
            # Compute AP class by class; the predictions must be filtered by
            # the final predicted class before computing AP.
            prediction1 = []
            for pred in preds:
                if pred[3] == label:
                    prediction1.append([pred[0], pred[1], pred[2][0],
                                        pred[2][1], pred[2][2], pred[2][3]])
            if len(prediction1) != 0:
                # Compute AP only when there are prediction boxes.
                rec, prec, ap = voc_eval(targets, prediction1, label)
            else:
                ap = 0
            sum_ap += ap

        map = sum_ap / len(all_labels)
        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
        return map
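# voc_eval is defined elsewhere in the project; at its core, VOC-style AP
# matches predictions to ground-truth boxes by IoU. A minimal sketch of that
# overlap computation, under the [xmin, ymin, xmax, ymax] convention used above:
def box_iou(box_a, box_b):
    """Intersection-over-union of two [xmin, ymin, xmax, ymax] boxes."""
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0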
import argparse

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from flyai.dataset import Dataset
# flyai's log-printing helper
from flyai.utils.log_helper import train_log
from torch.optim import Adam

from model import Model
from net import resnet18
from path import MODEL_PATH
from transformation import src

# Data helper class
dataset = Dataset()
# Model helper class
model = Model(dataset)

# Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=100, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=24, type=int, help="batch size")
args = parser.parse_args()

# Check whether a GPU is available.
First-time users: see the "read me first" HTML file in the project.
FAQ: https://www.flyai.com/question
'''
'''
Project hyperparameters.
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=64, type=int, help="batch size")
args = parser.parse_args()
'''
Data-handling helper provided by the flyai library.
Pass in how many epochs to train on the full data and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
model = Model(dataset)

word_dict, word_dict_res = load_dict()
vocab_size = max(word_dict.values()) + 1

# Hyperparameters
embedding_dim = 64    # Embedding layer size
dnn_dim = 128         # Dense layer size
max_seq_len = 128     # Maximum sentence length
num_filters = 64      # Number of convolution filters
kernel_size = 5       # Convolution kernel size
learning_rate = 1e-3  # Learning rate
numclass = 2          # Number of classes

# Placeholders
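# A minimal sketch of the kind of text CNN these hyperparameters suggest,
# assuming a Keras backend; the actual network is defined elsewhere in the
# project, so layer choices here are illustrative only.
from keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input
from keras.models import Model as KerasModel

inputs = Input(shape=(max_seq_len,))
h = Embedding(vocab_size, embedding_dim)(inputs)
h = Conv1D(num_filters, kernel_size, activation='relu')(h)
h = GlobalMaxPooling1D()(h)
h = Dense(dnn_dim, activation='relu')(h)
outputs = Dense(numclass, activation='softmax')(h)
net = KerasModel(inputs, outputs)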
        return labels

    '''
    Method for saving the model.
    '''
    def save_model(self, model, path, name=KERAS_MODEL_NAME, overwrite=False):
        super().save_model(model, path, name, overwrite)
        model.save(os.path.join(path, name))


if __name__ == '__main__':
    print('ojbk')
    dataset = Dataset(epochs=5, batch=16)
    model = Model(dataset)
    p = model.predict_all([])
    print(p)
    x, y = dataset.next_train_batch()
    a = {
        'images/00007635_001.png': 0,
        'images/00002573_000.png': 0,
        'images/00000368_005.png': 0,
    }
    a1 = {
        'images/00007635_001.png': 0
    }
    aa = [