Example #1
class Instructor(object):
    """
    Notes: uses get_all_data from the flyai Dataset | next_batch provided by flyai
    """
    def __init__(self, args):
        self.args = args

        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)

    def run(self):
        best_err1 = 100.
        best_epoch = 0

        logger.info('==> creating model "{}"'.format(args.model_name))
        model = Util.getModel(**vars(args))

        model = model.to(DEVICE)
        # In most cases this flag lets cuDNN's built-in auto-tuner search for the most efficient algorithms for the current configuration, improving runtime performance.
        cudnn.benchmark = True
        # define loss function (criterion) and optimizer
        # criterion = nn.CrossEntropyLoss().to(DEVICE)
        # Label smoothing
        criterion = LabelSmoothingLoss(classes=self.args.num_classes,
                                       smoothing=0.1)
        # Focal Loss
        # criterion = FocalLoss(class_num=self.args.num_classes)
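        # LabelSmoothingLoss is a project-local class (not shown in this snippet);
        # a hedged sketch of the standard formulation it is assumed to implement,
        # with C = num_classes and eps = smoothing:
        #   target distribution q(k) = 1 - eps        for the true class,
        #                       q(k) = eps / (C - 1)  for every other class
        #   loss = mean over the batch of sum_k( -q(k) * log_softmax(logits)[k] )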

        # define optimizer
        optimizer = Util.getOptimizer(model=model, args=self.args)

        trainer = Trainer_1(dataset=self.dataset,
                            criterion=criterion,
                            optimizer=optimizer,
                            args=self.args,
                            logger=logger)
        logger.info('train: {} test: {}'.format(
            self.dataset.get_train_length(),
            self.dataset.get_validation_length()))
        for epoch in range(0, self.args.EPOCHS):
            # train for one epoch
            model = trainer.train(model=model, epoch=epoch)

            # evaluate on validation set
            model, val_err1 = trainer.test(model=model, epoch=epoch)

            # remember best err@1 and save checkpoint
            is_best = val_err1 < best_err1
            if is_best:
                best_err1 = val_err1
                best_epoch = epoch
                logger.info('Best val_err1 {}'.format(best_err1))
            Util.save_checkpoint(model.state_dict(), is_best,
                                 args.output_models_dir)
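            # early stopping: break when there has been no improvement for
            # `patience` epochs (a non-positive patience disables this check)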
            if not is_best and epoch - best_epoch >= args.patience > 0:
                break

        logger.info('Best val_err1: {:.4f} at epoch {}'.format(
            best_err1, best_epoch))
    def generate(self):
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)
        source, target, _, _ = self.dataset.get_all_data()
        source = np.asarray([i['source'].split(' ') for i in source])
        target = np.asarray([i['target'].split(' ') for i in target])

        index = [i for i in range(len(source))]
        np.random.shuffle(index)  # shuffle in place so the 90/10 split below is random
        train_source, dev_source = source[index[0:int(len(index) * 0.9)]], source[index[int(len(index) * 0.9):]]
        train_target, dev_target = target[index[0:int(len(index) * 0.9)]], target[index[int(len(index) * 0.9):]]

        return train_source, train_target, dev_source, dev_target
    def __init__(self, exec_type='train'):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
        parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
        self.args = parser.parse_args()
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH)
        self.model_dir = os.path.join(os.getcwd(), arguments.model_dir)

        # 1. Split the data, read into defined format
        label2idx = dict((arguments.labels[i], i) for i in range(len(arguments.labels)))
        target_text, stance, _, _ = self.dataset.get_all_data()

        indexes = [" ".join(jieba.cut(i['TARGET'].lower(), cut_all=False)) for i in target_text]
        questions = [" ".join(jieba.cut(i['TEXT'].lower(), cut_all=False)) for i in target_text]
        labels = [i['STANCE'] for i in stance]
        data = [indexes, questions, labels]
        assert len(data[0]) == len(data[1]) == len(data[2])

        # 2. Data follows this order: train, test
        train_num = int(len(data[0]) * arguments.portion)
        train_data = [d[:train_num] for d in data]
        dev_data = [d[train_num:] for d in data]

        # 3. Read the vocab text file and get VOCAB dictionary
        vocab = read_emb(filename=os.path.join(os.getcwd(), arguments.sgns_dir), stat_lines=1)

        # 4. Transform text into indexes
        self.datasets, word2idx, embeddings = make_datasets(vocab=vocab,
                                                            raw_data={'training': train_data, 'validation': dev_data},
                                                            label2idx=label2idx,
                                                            big_voc=arguments.big_voc,
                                                            feat_names=arguments.feat_names)
        self.datasets_train = load_tvt(tvt_set=self.datasets['training'],
                                       max_lens=[arguments.ans_len, arguments.ask_len],
                                       feat_names=arguments.feat_names)
        self.datasets_dev = load_tvt(tvt_set=self.datasets['validation'],
                                     max_lens=[arguments.ans_len, arguments.ask_len],
                                     feat_names=arguments.feat_names)

        idx2word = dict((v, k) for k, v in word2idx.items())
        self.datasets["word2idx"] = word2idx
        self.datasets["idx2word"] = idx2word

        self.embeddings = torch.from_numpy(np.asarray(embeddings, dtype=np.float32))

        if exec_type == 'train':
            self.main()
        else:
            model = load_torch_model(self.model_dir)
            test(model=model, dataset=self.datasets, test_set=None)
    def __init__(self, arguments):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e",
                            "--EPOCHS",
                            default=5,
                            type=int,
                            help="train epochs")
        parser.add_argument("-b",
                            "--BATCH",
                            default=2,
                            type=int,
                            help="batch size")
        self.args = parser.parse_args()
        self.arguments = arguments
        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)

        if 'bert' in self.arguments.model_name:
            self.tokenizer = Tokenizer4Bert(
                max_seq_len=self.arguments.max_seq_len,
                pretrained_bert_name=os.path.join(
                    os.getcwd(), self.arguments.pretrained_bert_name))
            bert = BertModel.from_pretrained(pretrained_model_name_or_path=self
                                             .arguments.pretrained_bert_name)
            self.model = self.arguments.model_class(bert, self.arguments).to(
                self.arguments.device)
        else:
            self.tokenizer = Util.bulid_tokenizer(
                fnames=[
                    self.arguments.dataset_file['train'],
                    self.arguments.dataset_file['test']
                ],
                max_seq_len=self.arguments.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
            embedding_matrix = Util.build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=self.arguments.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(self.arguments.embed_dim), self.arguments.dataset))
            self.model = self.arguments.model_class(
                embedding_matrix, self.arguments).to(self.arguments.device)

        if self.arguments.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(
                    device=self.arguments.device.index)))

        Util.print_args(model=self.model, logger=logger, args=self.arguments)
def DatasetExtendToSize(readCsvOnLocal=True,
                        train_size=32,
                        val_size=32,
                        classify_count=10):
    '''
    :param readCsvOnLocal: set True when running locally, False when running on the flyai server
    :param train_size: expand each class of the training set to this many samples
    :param val_size: expand each class of the validation set to this many samples
    :param classify_count: number of classes
    :return: a flyai Dataset instance
    '''
    # step 0 : read csv
    # flyai_source = readCsv_onFlyai(readCsvOnLocal)
    flyai_source = SourceByWangyi().source_csv
    # step 1 : csv to dataframe
    dataframe_train = pd.DataFrame(data=flyai_source.data)
    dataframe_test = pd.DataFrame(data=flyai_source.val)
    # step 2 : extend csv(dataframe)
    dataframe_train = ExtendCsvToSize(dataframe_train,
                                      size=train_size,
                                      classify_count=classify_count)
    dataframe_test = ExtendCsvToSize(dataframe_test,
                                     size=val_size,
                                     classify_count=classify_count)
    # step 3 : save csv
    dataframe_train.to_csv(os.path.join(DATA_PATH, 'wangyi-train.csv'),
                           index=False)
    dataframe_test.to_csv(os.path.join(DATA_PATH, 'wangyi-test.csv'),
                          index=False)
    # step 4 : load to flyai.dataset
    dataset_extend_newone = Dataset(
        source=readCustomCsv("wangyi-train.csv", "wangyi-test.csv"))
    return dataset_extend_newone
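# Hypothetical usage of the helper above (the argument values are illustrative only):
# dataset = DatasetExtendToSize(readCsvOnLocal=True, train_size=64, val_size=32, classify_count=10)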
    def predict_to_csv(self):
        save_file_name = 'upload-by-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.csv'
        save_file_name = os.path.join(os.curdir, 'data', 'output', save_file_name)

        # 1. Create the file object
        with open(save_file_name, 'w', encoding='utf-8', newline='') as f:
            # 2. Build a csv writer on top of the file object
            csv_writer = csv.writer(f)

            # 3. Write the header row
            csv_writer.writerow(["image_path", "labels"])

            url_list = self.read_predict_Csv()
            dataset = Dataset(epochs=5, batch=16)
            model = Model(dataset)
            predict_list = []
            for row in url_list:
                predict_num = model.predict(image_path=row)
                # csv_writer.writerow([row, str(predict_num)])
                # predict_list.append(predict_num)
                predict_list.append([row,predict_num])
                # print a progress bar
                count_now = len(predict_list)
                count_total = len(url_list)
                print('\r' + 'prediction progress: ' + str(count_now) + '/' + str(count_total),
                      '----{:.2%}'.format(count_now / count_total), end='', flush=True)
            csv_writer.writerows(predict_list)
        print('\nCSV saved to ', save_file_name)
    def generate(self):
        self.data = Dataset(epochs=self.args.EPOCHS,
                            batch=self.args.BATCH,
                            val_batch=self.args.BATCH)
        audio_paths, labels, _, _ = self.data.get_all_data()

        # paths of the wav files
        audio_paths = [i['audio_path'] for i in audio_paths]
        # wav transcript text. TODO: the raw labels contain spaces; test whether removing them improves the model
        audio_labels = []
        # pinyin of the wav transcripts
        audio_pinyins = []
        for label in labels:
            label = label['label'].split(' ')
            audio_labels.append(''.join(label))
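            # pypinyin with Style.TONE3 appends the tone number to each syllable,
            # e.g. '你好' -> 'ni3 hao3'; the nested joins below flatten the
            # per-character results into one space-separated pinyin string per label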
            audio_pinyins.append(' '.join([
                ' '.join([
                    ' '.join(j)
                    for j in pinyin(i, style=Style.TONE3, heteronym=False)
                ]) for i in label
            ]))

        # Build the vocabulary
        for label in labels:
            self.sortedDict.append_tokens(label)
        self.sortedDict.dump_pkl()

        # Split into train / validation sets
        audio_paths = np.asarray(audio_paths)
        audio_labels = np.asarray(audio_labels)
        audio_pinyins = np.asarray(audio_pinyins)

        index = [i for i in range(len(audio_paths))]
        np.random.shuffle(index)  # shuffle in place so the 90/10 split below is random
        split = int(len(index) * 0.9)
        train_audio_paths, dev_audio_paths = audio_paths[index[:split]], audio_paths[index[split:]]
        train_labels, dev_labels = audio_labels[index[:split]], audio_labels[index[split:]]
        train_pinyins, dev_pinyins = audio_pinyins[index[:split]], audio_pinyins[index[split:]]

        return (train_audio_paths.tolist(), train_labels.tolist(), train_pinyins.tolist(),
                dev_audio_paths.tolist(), dev_labels.tolist(), dev_pinyins.tolist())
    def __init__(self, arguments):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
        parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
        self.args = parser.parse_args()
        self.arguments = arguments
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

        if 'bert' in self.arguments.model_name:
            self.tokenizer = Tokenizer4Bert(max_seq_len=self.arguments.max_seq_len,
                                            pretrained_bert_name=os.path.join(os.getcwd(),
                                                                              self.arguments.pretrained_bert_name))
            bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
            self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
        else:
            self.tokenizer = Util.bulid_tokenizer(
                fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
                max_seq_len=self.arguments.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset)
            )
            embedding_matrix = Util.build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=self.arguments.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset)
            )
            self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

        if self.arguments.device.type == 'cuda':
            logger.info(
                'cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=self.arguments.device.index)))

        Util.print_args(model=self.model, logger=logger, args=self.arguments)

        target_text, stance, _, _ = self.dataset.get_all_data()
        target = np.asarray([i['TARGET'].lower() for i in target_text])
        text = np.asarray([i['TEXT'].lower() for i in target_text])
        stance = np.asarray([i['STANCE'] for i in stance])
        self.target_set = set()
        for tar in target:
            self.target_set.add(tar)
        text = PreProcessing(text).get_file_text()
        trainset = ABSADataset(data_type=None, fname=(target, text, stance), tokenizer=self.tokenizer)

        valset_len = int(len(trainset) * self.arguments.valset_ratio)
        self.trainset, self.valset = random_split(trainset, (len(trainset) - valset_len, valset_len))
Example #9
    def generate(self):
        self.dataset = Dataset(epochs=self.arguments.EPOCHS,
                               batch=self.arguments.BATCH,
                               val_batch=self.arguments.BATCH)
        news, category, _, _ = self.dataset.get_all_data()
        news = np.asarray([i['news'] for i in news])
        category = np.asarray([i['category'] for i in category])

        index = [i for i in range(len(news))]
        np.random.shuffle(index)  # shuffle in place so the 90/10 split below is random
        split = int(len(index) * 0.9)
        train_news, dev_news = news[index[:split]], news[index[split:]]
        train_category, dev_category = category[index[:split]], category[index[split:]]

        return train_news, train_category, dev_news, dev_category
def getDatasetListByClassfy(classify_count=3):
    test_source = SourceByWangyi()
    xx, yy = test_source.get_sliceCSVbyClassify(classify_count=classify_count)
    list_tmp = []
    for epoch in range(classify_count):
        dataset = Dataset(source=readCustomCsv(xx[epoch], yy[epoch]))
        list_tmp.append(dataset)

    return list_tmp
Example #11
class Predictor(object):
    def __init__(self, path=MODEL_PATH, name='final.h5'):
        self.data = Dataset()
        from keras.utils import CustomObjectScope
        with CustomObjectScope({'AttLayer': AttLayer}):
            self.model = load_model(os.path.join(path, name))

    def predict(self, **data):
        p = self.model.predict(self.data.predict_data(**data))
        return p

    def to_category(self, p):
        y = self.data.to_categorys(p)
        return y

    def predict_to_category(self, **data):
        p = self.predict(**data)
        y = self.to_category(p)
        return y
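    # Hypothetical usage (the raw fields passed as **data depend on the project's Processor):
    # predictor = Predictor()
    # label = predictor.predict_to_category(**sample)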
    def __init__(self, exec_type="train"):
        parser = argparse.ArgumentParser()
        parser.add_argument("-e",
                            "--EPOCHS",
                            default=10,
                            type=int,
                            help="train epochs")
        parser.add_argument("-b",
                            "--BATCH",
                            default=24,
                            type=int,
                            help="batch size")
        args = parser.parse_args()

        self.batch_size = args.BATCH
        self.epochs = args.EPOCHS

        self.learning_rate = arguments.learning_rate
        self.embedding_size = arguments.embedding_size
        self.hidden_size = arguments.hidden_size
        self.tags = arguments.tags
        self.dropout = arguments.dropout
        self.tag_map = {label: i for i, label in enumerate(arguments.labels)}

        if exec_type == "train":
            self.model = Net(
                tag_map=self.tag_map,
                batch_size=self.batch_size,
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
        else:
            self.model = None

        self.dataset = Dataset(epochs=self.epochs, batch=self.batch_size)
Example #13
# -*- coding: utf-8 -*-
'''
Implements calling the model for prediction
'''
from flyai.dataset import Dataset
from model import Model

data = Dataset()
model = Model(data)

dataset = Dataset()
x_test, y_test = dataset.evaluate_data_no_processor('dev.csv')
preds = model.predict_all(x_test)

Example #14
 parser.add_argument("-e",
                     "--EPOCHS",
                     default=5,
                     type=int,
                     help="train epochs")
 parser.add_argument("-b",
                     "--BATCH",
                     default=16,
                     type=int,
                     help="batch size")
 args = parser.parse_args()
 '''
 Data handling helpers provided by the flyai library:
 pass in how many epochs to train over the whole dataset and the batch size.
 '''
 dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
 print(
     f'train datas: {dataset.get_train_length()}, val data: {dataset.get_validation_length()}'
 )
 lr = 1e-4
 num_warmup_steps = 1000
 max_grad = 1.0
 '''
 Implement your own network structure here
 '''
 # check whether a GPU is available
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 net = Net().to(device)
 model = Model(dataset, net)
 # print(net)
 # optimizer = torch.optim.Adam(net.parameters(), lr=5e-6)
Example #15
import argparse
import tensorflow as tf
from flyai.dataset import Dataset

from model import Model

# Data access helper class
from path import MODEL_PATH

dataset = Dataset()
dataset.get_all_processor_data()
model = Model(dataset)

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=10, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
args = parser.parse_args()
'''
Implement your own algorithm with TensorFlow
'''
# Define the input placeholders
x = tf.placeholder(tf.float32, shape=[None, 200, 200, 3], name='input_x')
y = tf.placeholder(tf.int64, shape=[None], name='input_y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')


# Initialize the weights
def weight_variable(shape, name):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)
For FAQs visit: https://www.flyai.com/question
Feedback and questions earn red-packet rewards! Add customer service WeChat: flyaixzs
'''
'''
Project hyperparameters
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=1, type=int, help="batch size")
args = parser.parse_args()
'''
Data handling helpers provided by the flyai library:
pass in how many epochs to train over the whole dataset and the batch size.
'''
print('batch_size: %d, epoch_size: %d' % (args.BATCH, args.EPOCHS))
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH, val_batch=32)
model = Model(dataset)

print("number of train examples:%d" % dataset.get_train_length())
print("number of validation examples:%d" % dataset.get_validation_length())
# region hyperparameters
n_classes = 45
fc1_dim = 512
# endregion

# region define the input placeholders
x_inputs = tf.placeholder(shape=(None, 224, 224, 3),
                          dtype=tf.float32,
                          name='x_inputs')
y_inputs = tf.placeholder(shape=(None, n_classes),
                          dtype=tf.float32,
Example #17
'''
Project hyperparameters
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e",
                    "--EPOCHS",
                    default=50,
                    type=int,
                    help="train epochs")
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
args = parser.parse_args()
'''
Data handling helpers provided by the flyai library:
pass in how many epochs to train over the whole dataset and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
model = Model(dataset)
'''
Implement your own network structure here
'''
x = tf.placeholder(tf.float32, shape=[None, 28, 28, 1], name='input_x')
y = tf.placeholder(tf.float32, shape=[None, 10], name='input_y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
learning_rate = 0.001
'''
dataset.get_step() returns the total number of iterations over the data
'''
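# A hedged, minimal sketch of how get_step() is typically consumed together with
# next_train_batch() (method names taken from other examples on this page):
#
#     for step in range(dataset.get_step()):
#         x_train, y_train = dataset.next_train_batch()
#         # run one optimization step on x_train / y_train here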


# parameter summary
Example #18
            else:
                ap = 0
            sum_ap += ap
        map = sum_ap / len(all_labels)

        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
        return map


if __name__ == "__main__":

    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # Predictions from the model, in the format: [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)
Example #19
class Instructor(object):
    """
    Notes: uses get_all_data from the flyai Dataset | batches are split manually (own next batch)
    """
    def __init__(self, arguments):
        self.arguments = arguments

    def train(self,
              train_category,
              dev_category,
              train_news,
              dev_news,
              tokenizer,
              Net=None,
              model=None):
        if os.path.exists(self.arguments.output_config_file) is True:
            os.remove(self.arguments.output_config_file)

        logger.info('>>train.shape: {} | dev.shape: {}'.format(
            train_category.shape, dev_category.shape))
        train_dataloader, train_examples_len = Util.load_data(
            news=train_news,
            category=train_category,
            data_type='train',
            label_list=self.arguments.label_list,
            max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer,
            batch_size=self.arguments.BATCH)
        dev_dataloader, dev_examples_len = Util.load_data(
            news=dev_news,
            category=dev_category,
            data_type='dev',
            label_list=self.arguments.label_list,
            max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer,
            batch_size=self.arguments.BATCH)

        num_train_optimization_steps = int(
            train_examples_len / self.arguments.BATCH /
            self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS

        # Model preparation
        logger.info("model name is {}".format(self.arguments.model_name))

        if model is None:
            if self.arguments.model_name == "BertOrigin":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == 'BertHAN':
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == "BertCNN":
                filter_sizes = [
                    int(val) for val in self.arguments.filter_sizes.split()
                ]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == "BertATT":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == "BertRCNN":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    rnn_hidden_size=self.arguments.rnn_hidden_size,
                    num_layers=self.arguments.num_layers,
                    bidirectional=self.arguments.bidirectional,
                    dropout=self.arguments.dropout)

            elif self.arguments.model_name == "BertCNNPlus":
                filter_sizes = [
                    int(val) for val in self.arguments.filter_sizes.split()
                ]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes)

        model.to(DEVICE)
        """ 优化器准备 """
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
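        # Standard BERT fine-tuning practice: bias and LayerNorm parameters are
        # excluded from weight decay, which is why they form the second group
        # above with weight_decay 0.0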

        # To reproduce BertAdam specific behavior set correct_bias=False
        optimizer = AdamW(params=optimizer_grouped_parameters,
                          lr=self.arguments.learning_rate,
                          correct_bias=False)
        # PyTorch scheduler
        scheduler = WarmupLinearSchedule(
            optimizer=optimizer,
            warmup_steps=self.arguments.warmup_proportion,
            t_total=num_train_optimization_steps)
        """ 损失函数准备 """
        if self.arguments.use_label_smoothing:
            criterion = NMTCriterion(
                label_smoothing=self.arguments.label_smoothing)
        else:
            criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(DEVICE)

        best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
        for epoch in range(int(self.arguments.EPOCHS)):
            if early_stop_times >= self.arguments.early_stop * (
                    train_examples_len // self.arguments.BATCH):
                break

            logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                if self.arguments.label_smoothing:
                    criterion.train()

                batch = tuple(t.to(DEVICE) for t in batch)
                _, input_ids, input_mask, segment_ids, label_ids = batch

                logits = model(input_ids, segment_ids, input_mask, labels=None)
                loss = criterion(inputs=logits,
                                 labels=label_ids,
                                 normalization=1.0,
                                 reduce=False)

                # scale the loss when accumulating gradients over multiple steps
                if self.arguments.gradient_accumulation_steps > 1:
                    loss = loss / self.arguments.gradient_accumulation_steps

                loss.backward(torch.ones_like(loss))
                scheduler.step()

                if (step +
                        1) % self.arguments.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    if global_step % self.arguments.print_step == 0 and global_step != 0:
                        dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                            model,
                            dev_dataloader,
                            criterion,
                            DEVICE,
                            self.arguments.label_list,
                            args=self.arguments)
                        logger.info('\n>>>dev report: \n{}'.format(dev_report))
                        # select the best model by accuracy
                        if dev_acc > best_acc:
                            best_acc = dev_acc
                            # track the best auc as well
                            if dev_auc > best_auc:
                                best_auc = dev_auc

                            # save the model
                            model_to_save = model.module if hasattr(
                                model, 'module') else model
                            torch.save(model_to_save.state_dict(),
                                       self.arguments.output_model_file)
                            with open(self.arguments.output_config_file,
                                      'w') as f:
                                f.write(model_to_save.config.to_json_string())

                            early_stop_times = 0
                        else:
                            early_stop_times += 1

        if os.path.exists(self.arguments.output_config_file) is False:
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(),
                       self.arguments.output_model_file)
            with open(self.arguments.output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

    def generate(self):
        self.dataset = Dataset(epochs=self.arguments.EPOCHS,
                               batch=self.arguments.BATCH,
                               val_batch=self.arguments.BATCH)
        news, category, _, _ = self.dataset.get_all_data()
        news = np.asarray([i['news'] for i in news])
        category = np.asarray([i['category'] for i in category])

        index = [i for i in range(len(news))]
        np.random.shuffle(index)  # shuffle in place so the 90/10 split below is random
        split = int(len(index) * 0.9)
        train_news, dev_news = news[index[:split]], news[index[split:]]
        train_category, dev_category = category[index[:split]], category[index[split:]]

        return train_news, train_category, dev_news, dev_category

    def run(self):
        remote_helper.get_remote_date(
            "https://www.flyai.com/m/chinese_base.zip")
        before_vocab_dir = os.path.join(os.getcwd(), 'vocab.txt')
        after_vocab_dir = os.path.join(args.bert_model_dir, 'vocab.txt')
        logger.info('>before_vocab_dir:{}'.format(before_vocab_dir))
        logger.info('>after_vocab_dir:{}'.format(after_vocab_dir))

        shutil.copyfile(before_vocab_dir, after_vocab_dir)

        if not os.path.exists(self.arguments.output_dir):
            os.mkdir(self.arguments.output_dir)

        self.arguments.BATCH = self.arguments.BATCH // self.arguments.gradient_accumulation_steps

        # Data preparation: choose the tokenizer
        tokenizer = BertTokenizer(
            self.arguments.bert_vocab_file).from_pretrained(
                self.arguments.bert_model_dir,
                do_lower_case=self.arguments.do_lower_case)
        # Fetch the data: news / keywords
        train_news, train_category, dev_news, dev_category = self.generate()

        self.train(Net=Net,
                   train_category=train_category,
                   dev_category=dev_category,
                   train_news=train_news,
                   dev_news=dev_news,
                   tokenizer=tokenizer)
Example #20
from flyai.dataset import Dataset

from model import Model

data = Dataset()
model = Model(data)
# p = model.predict(age=53, sex=1, cp=3, trestbps=130, chol=246, fbs=1, restecg=2, thalach=173, exang=0, oldpeak=0.0,
#                   slope=1, ca=3, thal=3)
# print(p)

tData = data.get_all_data()
preds = model.predict_all(tData[0])

y_test = []
for label in tData[1]:
    y_test.append(label['label'])

rCount = 0.0
for i in range(0, len(preds)):
    if preds[i] == y_test[i]:
        rCount += 1.

test_acc = rCount / len(preds)

print('accuracy %g' % test_acc)
Example #21
                    type=int,
                    help="train epochs")
parser.add_argument("-b", "--BATCH", default=64, type=int, help="batch size")
parser.add_argument("-vb",
                    "--VAL_BATCH",
                    default=64,
                    type=int,
                    help="val batch size")
args = parser.parse_args()
#  In this example, args.BATCH and args.VAL_BATCH must have the same size
'''
Data handling helpers provided by the flyai library:
pass in how many epochs to train over the whole dataset and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS,
                  batch=args.BATCH,
                  val_batch=args.VAL_BATCH)
model = Model(dataset)

# Hyperparameters
que_dict, ans_dict = load_dict()
encoder_vocab_size = len(que_dict)
decoder_vocab_size = len(ans_dict)
# Batch Size,
batch_size = args.BATCH
# RNN Size
rnn_size = 64
# Number of Layers
num_layers = 3
# Embedding Size
encoding_embedding_size = 64
Example #22
# -*- coding: utf-8 -*-
from flyai.source.source import Source
from flyai.utils.yaml_helper import Yaml
from flyai.dataset import Dataset
from model import Model

print('predict was called')

data = Dataset()
model = Model(data)

p = model.predict_all(data.get_all_data()[0])
print(p)
Example #23
    def __init__(self, args):
        self.args = args

        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)
Example #24
'''
Implements calling the model for prediction
'''
from flyai.dataset import Dataset

from model import Model

data = Dataset()
model = Model(data)
p = model.predict(
    text='gute lage im stadtzentrum. shoppingmeile und sehenswürdigkeiten, sowie gute pubs in laufweite. das hotel ist neu, gut gepflegt und hat bemühtes nettes personal. ideal für einen kurztrip nach edinburgh. längere aufenthalte eher nicht, da die zimmer recht klein sind.')
print(p)
Example #25
# author=yphacker

import argparse
import numpy as np
import tensorflow as tf
from flyai.dataset import Dataset
import config
from model import Model
from bert_model import BertModel

parser = argparse.ArgumentParser()
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
parser.add_argument("-e", "--EPOCHS", default=8, type=int, help="train epochs")
args = parser.parse_args()

dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
modelpp = Model(dataset)
model = BertModel()


def learning_rate_decay(learning_rate):
    return learning_rate * 0.5


def evaluate(sess):
    """评估在某一数据上的准确率和损失"""
    x_val_all, y_val_all = dataset.get_all_validation_data()
    data_len = len(y_val_all)
    index = np.random.permutation(len(y_val_all))
    n_batches = len(y_val_all) // args.BATCH + 1
    total_loss = 0.0
Example #26
from flyai.dataset import Dataset
from tensorflow.contrib.rnn import DropoutWrapper
import tensorflow as tf
from model import Model
from path import MODEL_PATH, LOG_PATH
import config
from utils import load_word2vec_embedding
import numpy as np

# Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=30, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=128, type=int, help="batch size")
args = parser.parse_args()
# Data access helper class
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)

# Model helper class
modelpp = Model(dataset)


'''
Implement your own algorithm with TensorFlow
'''
# Get the training and test data
unit_num = config.embeddings_size    # by default the word-embedding size equals the number of units in the RNN (per time step) and the CNN (per column); to avoid confusion it is written as unit_num throughout the model
time_step = config.max_sequence      # the maximum sentence length equals time_step; to avoid confusion it is written as time_step throughout the model
DROPOUT_RATE = config.dropout
LEARN_RATE = config.leanrate
TAGS_NUM = config.label_len
Example #27
def eval_one_batch(preds):
    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # Predictions from the model, in the format: [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)

    # Load the labels: [{'boxes':[], 'labels':[], 'image_id':[]}, ...]
    targets = []
    for i in range(len(y_test)):
        label_path = y_test[i]['label_path']  # label/019646.jpg.txt
        boxes = []
        labels = []
        image_id = []
        image_id.append(x_test[i]['image_path'])
        with open(os.path.join(DATA_PATH, label_path)) as f:
            for line in f.readlines():
                # 1954.7443195924375,695.1497671989313,1984.659514688955,738.4779589540301,1933
                temp = line.strip().split(',')
                xmin = int(float(temp[0]))
                ymin = int(float(temp[1]))
                xmax = int(float(temp[2]))
                ymax = int(float(temp[3]))
                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(int(temp[4]))
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        targets.append(target)
    '''
    The if branch below does not need to be modified
    '''
    if len(y_test) != len(x_test):
        result = dict()
        result['score'] = 0
        result['label'] = "评估违规"
        result['info'] = ""
        print(json.dumps(result))
    else:
        '''
        Implement different evaluation algorithms below
        '''
        # Start computing the final score
        sum_ap = 0
        all_labels = [i for i in range(2)]  # all target classes

        # The following was added by me

        for label in all_labels:  # compute AP for each class
            prediction1 = []  # when computing AP, the predictions must be filtered by their predicted class
            for pred in preds:
                if pred[3] == label:
                    prediction1.append([
                        pred[0], pred[1], pred[2][0], pred[2][1], pred[2][2],
                        pred[2][3]
                    ])
            if len(prediction1) != 0:  # compute AP only when there are predicted boxes
                rec, prec, ap = voc_eval(targets, prediction1, label)
            else:
                ap = 0
            sum_ap += ap
        map = sum_ap / len(all_labels)

        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
        return map
Example #28
import torch
import torch.nn as nn
from flyai.dataset import Dataset
from torch.optim import Adam

from model import Model
from net import resnet18
from path import MODEL_PATH
from transformation import src
import matplotlib.pyplot as plt

# Import flyai's training-log helper
from flyai.utils.log_helper import train_log

# Data access helper class
dataset = Dataset()

# Model helper class
model = Model(dataset)

# Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e",
                    "--EPOCHS",
                    default=100,
                    type=int,
                    help="train epochs")
parser.add_argument("-b", "--BATCH", default=24, type=int, help="batch size")
args = parser.parse_args()

# check whether a GPU is available
Example #29
First-time users: see the 第一次使用请读我.html file in the project
For FAQs visit: https://www.flyai.com/question
Feedback and questions earn red-packet rewards! Add customer service WeChat: flyaixzs
'''
'''
Project hyperparameters
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=64, type=int, help="batch size")
args = parser.parse_args()
'''
Data handling helpers provided by the flyai library:
pass in how many epochs to train over the whole dataset and the batch size.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
model = Model(dataset)

word_dict, word_dict_res = load_dict()
vocab_size = max(word_dict.values()) + 1

# Hyperparameters
embedding_dim = 64  # embedding layer size
dnn_dim = 128  # Dense layer size
max_seq_len = 128  # maximum sentence length
num_filters = 64  # number of convolution kernels
kernel_size = 5  # convolution kernel size
learning_rate = 1e-3  # learning rate
numclass = 2  # number of classes

# Placeholders
Example #30
        return labels

    '''
    Method for saving the model
    '''

    def save_model(self, model, path, name=KERAS_MODEL_NAME, overwrite=False):
        super().save_model(model, path, name, overwrite)
        model.save(os.path.join(path, name))


if __name__ == '__main__':

    print('ojbk')
    dataset = Dataset(epochs=5, batch=16)
    model = Model(dataset)

    p = model.predict_all([])
    print(p)

    x, y = dataset.next_train_batch()
    a = {
        'images/00007635_001.png': 0,
        'images/00002573_000.png': 0,
        'images/00000368_005.png': 0,
    }
    a1 = {
        'images/00007635_001.png':0
    }
    aa = [