def DatasetExtendToSize(readCsvOnLocal=True,
                        train_size=32,
                        val_size=32,
                        classify_count=10):
    '''
    :param readCsvOnLocal: set True when running locally, False when running on the flyai server
    :param train_size: extend each class of the training set to this size
    :param val_size: extend each class of the validation set to this size
    :param classify_count: number of classes
    :return: a flyai Dataset instance
    '''
    # step 0 : read csv
    # flyai_source = readCsv_onFlyai(readCsvOnLocal)
    flyai_source = SourceByWangyi().source_csv
    # step 1 : csv to dataframe
    dataframe_train = pd.DataFrame(data=flyai_source.data)
    dataframe_test = pd.DataFrame(data=flyai_source.val)
    # step 2 : extend csv(dataframe)
    dataframe_train = ExtendCsvToSize(dataframe_train,
                                      size=train_size,
                                      classify_count=classify_count)
    dataframe_test = ExtendCsvToSize(dataframe_test,
                                     size=val_size,
                                     classify_count=classify_count)
    # step 3 : save csv
    dataframe_train.to_csv(os.path.join(DATA_PATH, 'wangyi-train.csv'),
                           index=False)
    dataframe_test.to_csv(os.path.join(DATA_PATH, 'wangyi-test.csv'),
                          index=False)
    # step 4 : load to flyai.dataset
    dataset_extend_newone = Dataset(
        source=readCustomCsv("wangyi-train.csv", "wangyi-test.csv"))
    return dataset_extend_newone
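
# Hypothetical usage sketch (not part of the original snippet): oversample every
# class to a fixed size and then draw batches from the extended dataset, e.g.
#   dataset = DatasetExtendToSize(readCsvOnLocal=True,
#                                 train_size=256,
#                                 val_size=64,
#                                 classify_count=10)
#   x_train, y_train = dataset.next_train_batch()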
    def predict_to_csv(self):
        save_file_name = 'upload-by-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.csv'
        save_file_name = os.path.join(os.curdir, 'data', 'output', save_file_name)

        # 1. Create the file object
        with open(save_file_name, 'w', encoding='utf-8', newline='') as f:


            # 2. Build a csv writer on top of the file object
            csv_writer = csv.writer(f)

            # 3. Write the header row
            csv_writer.writerow(["image_path", "labels"])

            url_list = self.read_predict_Csv()
            dataset = Dataset(epochs=5, batch=16)
            model = Model(dataset)
            predict_list = []
            for row in url_list:
                predict_num = model.predict(image_path=row)
                # csv_writer.writerow([row, str(predict_num)])
                # predict_list.append(predict_num)
                predict_list.append([row, predict_num])
                # print a progress indicator
                count_now = len(predict_list)
                count_total = len(url_list)
                print('\r' + 'prediction progress: ' + str(count_now) + '/' + str(count_total),
                      '----{:.2%}'.format(count_now / count_total), end='', flush=True)
            csv_writer.writerows(predict_list)
        print('\nCSV saved to', save_file_name)
def getDatasetListByClassfy(classify_count=3):
    test_source = SourceByWangyi()
    xx, yy = test_source.get_sliceCSVbyClassify(classify_count=classify_count)
    list_tmp = []
    for epoch in range(classify_count):
        dataset = Dataset(source=readCustomCsv(xx[epoch], yy[epoch]))
        list_tmp.append(dataset)

    return list_tmp
    def generate(self):
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)
        source, target, _, _ = self.dataset.get_all_data()
        source = np.asarray([i['source'].split(' ') for i in source])
        target = np.asarray([i['target'].split(' ') for i in target])

        # shuffle the indices in place (shuffling a temporary np.asarray copy would leave `index` unchanged)
        index = list(range(len(source)))
        np.random.shuffle(index)
        train_source, dev_source = source[index[0:int(len(index) * 0.9)]], source[index[int(len(index) * 0.9):]]
        train_target, dev_target = target[index[0:int(len(index) * 0.9)]], target[index[int(len(index) * 0.9):]]

        return train_source, train_target, dev_source, dev_target
    def __init__(self, exec_type='train'):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
        parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
        self.args = parser.parse_args()
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH)
        self.model_dir = os.path.join(os.getcwd(), arguments.model_dir)

        # 1. Split the data, read into defined format
        label2idx = dict((arguments.labels[i], i) for i in range(len(arguments.labels)))
        target_text, stance, _, _ = self.dataset.get_all_data()

        indexes = [" ".join(jieba.cut(i['TARGET'].lower(), cut_all=False)) for i in target_text]
        questions = [" ".join(jieba.cut(i['TEXT'].lower(), cut_all=False)) for i in target_text]
        labels = [i['STANCE'] for i in stance]
        data = [indexes, questions, labels]
        assert len(data[0]) == len(data[1]) == len(data[2])

        # 2. Data follows this order: train, test
        train_num = int(len(data[0]) * arguments.portion)
        train_data = [d[:train_num] for d in data]
        dev_data = [d[train_num:] for d in data]

        # 3. Read the vocab text file and get VOCAB dictionary
        vocab = read_emb(filename=os.path.join(os.getcwd(), arguments.sgns_dir), stat_lines=1)

        # 4. Transform text into indexes
        self.datasets, word2idx, embeddings = make_datasets(vocab=vocab,
                                                            raw_data={'training': train_data, 'validation': dev_data},
                                                            label2idx=label2idx,
                                                            big_voc=arguments.big_voc,
                                                            feat_names=arguments.feat_names)
        self.datasets_train = load_tvt(tvt_set=self.datasets['training'],
                                       max_lens=[arguments.ans_len, arguments.ask_len],
                                       feat_names=arguments.feat_names)
        self.datasets_dev = load_tvt(tvt_set=self.datasets['validation'],
                                     max_lens=[arguments.ans_len, arguments.ask_len],
                                     feat_names=arguments.feat_names)

        idx2word = dict((v, k) for k, v in word2idx.items())
        self.datasets["word2idx"] = word2idx
        self.datasets["idx2word"] = idx2word

        self.embeddings = torch.from_numpy(np.asarray(embeddings, dtype=np.float32))

        if exec_type == 'train':
            self.main()
        else:
            model = load_torch_model(self.model_dir)
            test(model=model, dataset=self.datasets, test_set=None)
    def __init__(self, arguments):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e",
                            "--EPOCHS",
                            default=5,
                            type=int,
                            help="train epochs")
        parser.add_argument("-b",
                            "--BATCH",
                            default=2,
                            type=int,
                            help="batch size")
        self.args = parser.parse_args()
        self.arguments = arguments
        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)

        if 'bert' in self.arguments.model_name:
            self.tokenizer = Tokenizer4Bert(
                max_seq_len=self.arguments.max_seq_len,
                pretrained_bert_name=os.path.join(
                    os.getcwd(), self.arguments.pretrained_bert_name))
            bert = BertModel.from_pretrained(pretrained_model_name_or_path=self
                                             .arguments.pretrained_bert_name)
            self.model = self.arguments.model_class(bert, self.arguments).to(
                self.arguments.device)
        else:
            self.tokenizer = Util.bulid_tokenizer(
                fnames=[
                    self.arguments.dataset_file['train'],
                    self.arguments.dataset_file['test']
                ],
                max_seq_len=self.arguments.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
            embedding_matrix = Util.build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=self.arguments.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(self.arguments.embed_dim), self.arguments.dataset))
            self.model = self.arguments.model_class(
                embedding_matrix, self.arguments).to(self.arguments.device)

        if self.arguments.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(
                    device=self.arguments.device.index)))

        Util.print_args(model=self.model, logger=logger, args=self.arguments)
    def generate(self):
        self.data = Dataset(epochs=self.args.EPOCHS,
                            batch=self.args.BATCH,
                            val_batch=self.args.BATCH)
        audio_paths, labels, _, _ = self.data.get_all_data()

        # paths of the wav files
        audio_paths = [i['audio_path'] for i in audio_paths]
        # wav transcripts  TODO: these contain spaces; test whether removing them improves the model
        audio_labels = []
        # pinyin of the transcripts
        audio_pinyins = []
        for label in labels:
            label = label['label'].split(' ')
            audio_labels.append(''.join(label))
            audio_pinyins.append(' '.join([
                ' '.join([
                    ' '.join(j)
                    for j in pinyin(i, style=Style.TONE3, heteronym=False)
                ]) for i in label
            ]))
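        # Sketch of what the nested joins above produce, assuming pypinyin's pinyin()
        # with Style.TONE3 and heteronym=False, e.g.
        #   pinyin('中国', style=Style.TONE3, heteronym=False) -> [['zhong1'], ['guo2']]
        # i.e. each character yields one tone-numbered syllable, and the syllables of a
        # label are joined with spaces.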

        # Build the vocabulary
        for label in labels:
            self.sortedDict.append_tokens(label)
        self.sortedDict.dump_pkl()

        # Split into train/validation sets
        audio_paths = np.asarray(audio_paths)
        audio_labels = np.asarray(audio_labels)
        audio_pinyins = np.asarray(audio_pinyins)

        # shuffle the indices in place (shuffling a temporary np.asarray copy would leave `index` unchanged)
        index = list(range(len(audio_paths)))
        np.random.shuffle(index)
        split = int(len(index) * 0.9)
        train_audio_paths, dev_audio_paths = audio_paths[index[:split]], audio_paths[index[split:]]
        train_labels, dev_labels = audio_labels[index[:split]], audio_labels[index[split:]]
        train_pinyins, dev_pinyins = audio_pinyins[index[:split]], audio_pinyins[index[split:]]

        return (train_audio_paths.tolist(), train_labels.tolist(),
                train_pinyins.tolist(), dev_audio_paths.tolist(),
                dev_labels.tolist(), dev_pinyins.tolist())
    def __init__(self, arguments):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
        parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
        self.args = parser.parse_args()
        self.arguments = arguments
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

        if 'bert' in self.arguments.model_name:
            self.tokenizer = Tokenizer4Bert(max_seq_len=self.arguments.max_seq_len,
                                            pretrained_bert_name=os.path.join(os.getcwd(),
                                                                              self.arguments.pretrained_bert_name))
            bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
            self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
        else:
            self.tokenizer = Util.bulid_tokenizer(
                fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
                max_seq_len=self.arguments.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset)
            )
            embedding_matrix = Util.build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=self.arguments.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset)
            )
            self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

        if self.arguments.device.type == 'cuda':
            logger.info(
                'cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=self.arguments.device.index)))

        Util.print_args(model=self.model, logger=logger, args=self.arguments)

        target_text, stance, _, _ = self.dataset.get_all_data()
        target = np.asarray([i['TARGET'].lower() for i in target_text])
        text = np.asarray([i['TEXT'].lower() for i in target_text])
        stance = np.asarray([i['STANCE'] for i in stance])
        self.target_set = set()
        for tar in target:
            self.target_set.add(tar)
        text = PreProcessing(text).get_file_text()
        trainset = ABSADataset(data_type=None, fname=(target, text, stance), tokenizer=self.tokenizer)

        valset_len = int(len(trainset) * self.arguments.valset_ratio)
        self.trainset, self.valset = random_split(trainset, (len(trainset) - valset_len, valset_len))
Example #9
    def generate(self):
        self.dataset = Dataset(epochs=self.arguments.EPOCHS,
                               batch=self.arguments.BATCH,
                               val_batch=self.arguments.BATCH)
        news, category, _, _ = self.dataset.get_all_data()
        news = np.asarray([i['news'] for i in news])
        category = np.asarray([i['category'] for i in category])

        # shuffle the indices in place (shuffling a temporary np.asarray copy would leave `index` unchanged)
        index = list(range(len(news)))
        np.random.shuffle(index)
        split = int(len(index) * 0.9)
        train_news, dev_news = news[index[:split]], news[index[split:]]
        train_category, dev_category = category[index[:split]], category[index[split:]]

        return train_news, train_category, dev_news, dev_category
    def __init__(self, exec_type="train"):
        parser = argparse.ArgumentParser()
        parser.add_argument("-e",
                            "--EPOCHS",
                            default=10,
                            type=int,
                            help="train epochs")
        parser.add_argument("-b",
                            "--BATCH",
                            default=24,
                            type=int,
                            help="batch size")
        args = parser.parse_args()

        self.batch_size = args.BATCH
        self.epochs = args.EPOCHS

        self.learning_rate = arguments.learning_rate
        self.embedding_size = arguments.embedding_size
        self.hidden_size = arguments.hidden_size
        self.tags = arguments.tags
        self.dropout = arguments.dropout
        self.tag_map = {label: i for i, label in enumerate(arguments.labels)}

        if exec_type == "train":
            self.model = Net(
                tag_map=self.tag_map,
                batch_size=self.batch_size,
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
        else:
            self.model = None

        self.dataset = Dataset(epochs=self.epochs, batch=self.batch_size)
Example #11
    def __init__(self, args):
        self.args = args

        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)
Example #12
        x_data = self.data.predict_data(**data)
        predict = self.model_predict(x_data)
        predict = self.data.to_categorys(predict)
        return predict

    def predict_all(self, datas):
        predicts = []
        y_pred_cls = self.outputParams['y_pred_cls']
        input_x = self.inputParams['input_x']
        keep_prob = self.inputParams['keep_prob']
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, os.path.join(MODEL_PATH, TENSORFLOW_MODEL_DIR))
            for data in datas:
                x_data = self.data.predict_data(**data)
                predict = sess.run(y_pred_cls, feed_dict={input_x: x_data, keep_prob: 1.0})
                predict = self.data.to_categorys(predict)
                predicts.append(predict)

        return predicts

if __name__ == '__main__':

    # inputParams, outputParams, summaryParams = create_model(Processor().getWordsCount())
    # train_model(inputParams, outputParams, summaryParams,needInit=False)
    dataset = Dataset(train_batch=128, val_batch=64, split_ratio=0.9)
    model = Model(dataset)

    predic = model.predict(text="您好!我们这边是施华洛世奇鄞州万达店!您是我们尊贵的会员,特意邀请您参加我们x.x-x.x的三八女人节活动!满xxxx元享晶璨花漾丝巾")
    print('xxx')
Example #13
import os
from flyai.dataset import Dataset

from model import Model

data = Dataset(epochs=10, batch=32)
model = Model(data)
x_val, y_val = data.next_validation_batch()
#val = {y_val: x_val}
# x_train, y_train, x_test, y_test = data.next_batch(32)
# feed_dict={x: x_train}
print(x_val.shape)
#print(y_val)
print(x_val[0].shape)
val = {x_val[0]: '0'}
p = model.predict_all(val)
Example #14
        return labels

    def batch_iter(self, x, y, batch_size=128):
        """Generate batches of data"""
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1

        indices = numpy.random.permutation(numpy.arange(data_len))
        x_shuffle = x[indices]
        y_shuffle = y[indices]

        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
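
        # Hypothetical usage sketch (x_train / y_train assumed to be numpy arrays of equal length):
        #   for x_batch, y_batch in self.batch_iter(x_train, y_train, batch_size=64):
        #       run_train_step(x_batch, y_batch)   # run_train_step is a placeholder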

    def save_model(self,
                   network,
                   path,
                   name=Torch_MODEL_NAME,
                   overwrite=False):
        super().save_model(network, path, name, overwrite)
        #torch.save(network, os.path.join(path, name))
        torch.save(network.state_dict(), os.path.join(path, name))
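
        # Note (assumption; the matching load is not shown in this snippet): since only the
        # state_dict is saved here, loading would rebuild the network first and then call
        #   network.load_state_dict(torch.load(os.path.join(path, name)))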


if __name__ == '__main__':
    from flyai.dataset import Dataset
    dataset = Dataset(epochs=1, batch=2)
    m = Model(dataset)
    print(m.predict_all([{'image_path': "img/381.tif"}]))
Example #15
    def get_dataset(cls):
        if cls.dataset is not None:
            return cls.dataset
        else:
            cls.dataset = Dataset(epochs=3, batch=3, val_batch=3)
            return cls.dataset
Example #16
def eval_one_batch(preds):
    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # Get predictions from the model, in the format: [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)

    # Load the labels: [{'boxes': [], 'labels': [], 'image_id': []}, ...]
    targets = []
    for i in range(len(y_test)):
        label_path = y_test[i]['label_path']  # label/019646.jpg.txt
        boxes = []
        labels = []
        image_id = []
        image_id.append(x_test[i]['image_path'])
        with open(os.path.join(DATA_PATH, label_path)) as f:
            for line in f.readlines():
                # 1954.7443195924375,695.1497671989313,1984.659514688955,738.4779589540301,1933
                temp = line.strip().split(',')
                xmin = int(float(temp[0]))
                ymin = int(float(temp[1]))
                xmax = int(float(temp[2]))
                ymax = int(float(temp[3]))
                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(int(temp[4]))
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        targets.append(target)
    '''
    The if-block below does not need to be modified
    '''
    if len(y_test) != len(x_test):
        result = dict()
        result['score'] = 0
        result['label'] = "评估违规"
        result['info'] = ""
        print(json.dumps(result))
    else:
        '''
        Implement the evaluation algorithm below
        '''
        # compute the final score
        sum_ap = 0
        all_labels = [i for i in range(2)]  # all object classes

        # the following is my own addition

        for label in all_labels:  # compute AP for each class
            prediction1 = []  # when computing AP, predictions must be filtered by their predicted class
            for pred in preds:
                if pred[3] == label:
                    prediction1.append([
                        pred[0], pred[1], pred[2][0], pred[2][1], pred[2][2],
                        pred[2][3]
                    ])
            if len(prediction1) != 0:  # compute AP only when this class has predicted boxes
                rec, prec, ap = voc_eval(targets, prediction1, label)
            else:
                ap = 0
            sum_ap += ap
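        # mean Average Precision: average the per-class AP over all object classes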
        map = sum_ap / len(all_labels)

        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
        return map
Example #17
'''
Invoke the trained model
'''
from flyai.dataset import Dataset

from model import Model

data = Dataset()
model = Model(data)
p = model.predict(
    text='gute lage im stadtzentrum. shoppingmeile und sehenswürdigkeiten, sowie gute pubs in laufweite. das hotel ist neu, gut gepflegt und hat bemühtes nettes personal. ideal für einen kurztrip nach edinburgh. längere aufenthalte eher nicht, da die zimmer recht klein sind.')
print(p)
For FAQs visit: https://www.flyai.com/question
Red envelopes for feedback and questions! Add customer-service WeChat: flyaixzs
'''
'''
Project hyperparameters
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=1, type=int, help="batch size")
args = parser.parse_args()
'''
Data-processing helper provided by the flyai library.
Pass in how many epochs to train over the whole dataset and the batch size per step.
'''
print('batch_size: %d, epoch_size: %d' % (args.BATCH, args.EPOCHS))
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH, val_batch=32)
model = Model(dataset)

print("number of train examples:%d" % dataset.get_train_length())
print("number of validation examples:%d" % dataset.get_validation_length())
# region hyperparameters
n_classes = 45
fc1_dim = 512
# endregion

# region define input placeholders
x_inputs = tf.placeholder(shape=(None, 224, 224, 3),
                          dtype=tf.float32,
                          name='x_inputs')
y_inputs = tf.placeholder(shape=(None, n_classes),
                          dtype=tf.float32,
                          name='y_inputs')
Example #19
# -*- coding: utf-8 -*
from flyai.dataset import Dataset
from model import Model
# from dataset import  Dataset

from processor import Processor

# dataset = Dataset(train_batch=128, val_batch=64, split_ratio=0.9, )
dataset = Dataset(batch=32, val_batch=32)
model = Model(dataset)
#
# predict = model.predict(text="您好!我们这边是施华洛世奇鄞州万达店!您是我们尊贵的会员,特意邀请您参加我们x.x-x.x的三八女人节活动!满xxxx元享晶璨花漾丝巾")
# print(predict)

labels = [
    0, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
Example #20
    def __init__(self, arguments):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e",
                            "--EPOCHS",
                            default=5,
                            type=int,
                            help="train epochs")
        parser.add_argument("-b",
                            "--BATCH",
                            default=4,
                            type=int,
                            help="batch size")
        self.args = parser.parse_args()
        self.arguments = arguments
        self.dataset = Dataset(epochs=self.args.EPOCHS,
                               batch=self.args.BATCH,
                               val_batch=self.args.BATCH)

        if 'bert' in self.arguments.model_name:
            self.tokenizer = Tokenizer4Bert(
                max_seq_len=self.arguments.max_seq_len,
                pretrained_bert_name=os.path.join(
                    os.getcwd(), self.arguments.pretrained_bert_name))
            bert = BertModel.from_pretrained(pretrained_model_name_or_path=self
                                             .arguments.pretrained_bert_name)
            self.model = self.arguments.model_class(bert, self.arguments).to(
                self.arguments.device)
        else:
            self.tokenizer = Util.bulid_tokenizer(
                fnames=[
                    self.arguments.dataset_file['train'],
                    self.arguments.dataset_file['test']
                ],
                max_seq_len=self.arguments.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
            embedding_matrix = Util.build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=self.arguments.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(self.arguments.embed_dim), self.arguments.dataset))
            self.model = self.arguments.model_class(
                embedding_matrix, self.arguments).to(self.arguments.device)

        if self.arguments.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(
                    device=self.arguments.device.index)))

        Util.print_args(model=self.model, logger=logger, args=self.arguments)

        target_text, stance, _, _ = self.dataset.get_all_data()
        target = np.asarray([i['TARGET'].lower() for i in target_text])
        text = np.asarray([i['TEXT'].lower() for i in target_text])
        stance = np.asarray([i['STANCE'] for i in stance])

        # ############################# feature-lexicon approach: did not work well
        # train_data = pd.DataFrame(data=[stance, target, text]).T
        # train_data.columns = ['STANCE', 'TARGET', 'TEXT']
        # Util.calculate_word_count(train_data)
        # ############################# feature-lexicon approach: did not work well

        self.target_set = set()
        for tar in target:
            self.target_set.add(tar)
        text = PreProcessing(text).get_file_text()

        # ############################# synonym-replacement approach: did not work well
        # self.synonyms = SynonymsReplacer()
        # text_add = []
        # for index in range(len(text)):
        #     text_add.append(self.synonyms.get_syno_sents_list(text[index]))
        # target = np.append(target, target)
        # text = np.append(text, np.asarray(text_add))
        # stance = np.append(stance, stance)
        # ############################# synonym-replacement approach: did not work well

        print('target.shape: {}, text.shape: {}, stance.shape: {}'.format(
            target.shape, text.shape, stance.shape))
        trainset = ABSADataset(data_type=None,
                               fname=(target, text, stance),
                               tokenizer=self.tokenizer)

        valset_len = int(len(trainset) * self.arguments.valset_ratio)
        self.trainset, self.valset = random_split(
            trainset, (len(trainset) - valset_len, valset_len))
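
        # Follow-up sketch (assumption, not shown in this snippet): the two subsets are
        # typically wrapped in torch DataLoaders for the training/evaluation loops, e.g.
        #   from torch.utils.data import DataLoader
        #   train_loader = DataLoader(self.trainset, batch_size=self.args.BATCH, shuffle=True)
        #   val_loader = DataLoader(self.valset, batch_size=self.args.BATCH, shuffle=False)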
Example #21
# author=yphacker

import argparse
import numpy as np
import tensorflow as tf
from flyai.dataset import Dataset
import config
from model import Model
from cnn_model import CNN

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=8, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=32, type=int, help="batch size")
args = parser.parse_args()

dataset = Dataset(batch=args.BATCH, epochs=args.EPOCHS)
modelpp = Model(dataset)
model = CNN()

# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     train_writer = tf.summary.FileWriter(LOG_PATH, sess.graph)
#
#     # dataset.get_step() 获取数据的总迭代次数
#     for step in range(dataset.get_step()):
#         x_train, y_train = dataset.next_train_batch()
#         x_val, y_val = dataset.next_validation_batch()
#
#         fetches = [loss, accuracy, train_op]
#         feed_dict = {input_x: x_train, input_y: y_train, keep_prob: 0.5}
#         loss_, accuracy_, _ = sess.run(fetches, feed_dict=feed_dict)
Example #22
        return labels

    '''
    Method for saving the model
    '''

    def save_model(self, model, path, name=KERAS_MODEL_NAME, overwrite=False):
        super().save_model(model, path, name, overwrite)
        model.save(os.path.join(path, name))


if __name__ == '__main__':

    print('ojbk')
    dataset = Dataset(epochs=5, batch=16)
    model = Model(dataset)

    p = model.predict_all([])
    print(p)

    x,y = dataset.next_train_batch()
    a = {
        'images/00007635_001.png': 0,
        'images/00002573_000.png': 0,
        'images/00000368_005.png': 0,
    }
    a1 = {
        'images/00007635_001.png':0
    }
    aa = [
Example #23
    def __init__(self, path=MODEL_PATH, name='final.h5'):
        self.data = Dataset()
        from keras.utils import CustomObjectScope
        with CustomObjectScope({'AttLayer': AttLayer}):
            self.model = load_model(os.path.join(path, name))
Example #24
# -*- coding: utf-8 -*
'''
Invoke the trained model
'''
from flyai.dataset import Dataset
from model import Model

data = Dataset()
model = Model(data)

dataset = Dataset()
x_test, y_test = dataset.evaluate_data_no_processor('dev.csv')
preds = model.predict_all(x_test)

Example #25
    def __init__(self):
        self.args = args
        self.dataset = Dataset(epochs=self.args.total_epochs,
                               batch=self.args.batch_size,
                               val_batch=self.args.batch_size)
Example #26
            else:
                ap = 0
            sum_ap += ap
        map = sum_ap / len(all_labels)

        result = dict()
        result['score'] = round(map * 100, 2)
        result['label'] = "The Score is MAP."
        result['info'] = ""
        print(json.dumps(result))
        return map


if __name__ == "__main__":

    dataset = Dataset()
    model = Model(dataset)
    try:
        x_test, y_test = dataset.evaluate_data_no_processor("test.csv")
        print('eval.py use test.csv')
    except:
        x_test, y_test = dataset.evaluate_data_no_processor("dev.csv")
        print('eval.py use dev.csv')
    randnum = random.randint(0, 100)
    random.seed(randnum)
    random.shuffle(x_test)
    random.seed(randnum)
    random.shuffle(y_test)

    # Get predictions from the model, in the format: [[<image id> <confidence> <left> <top> <right> <bottom>], ...]
    preds = model.predict_all(x_test)
Example #27
    The argument is one row of the csv used as input x. This method is called
    repeatedly by dataset.next_train_batch() and dataset.next_validation_batch(),
    and is also called to process data during evaluation.
    Its fields correspond to input: -> columns: in app.yaml.
    '''

    def output_x(self, TARGET, TEXT):
        text2vec = self.input_x(TARGET, TEXT)
        return text2vec

    '''
    The output result; this method is called by dataset.to_categorys(data)
    '''

    def output_y(self, data):
        index = np.argmax(data)
        return index


if __name__ == '__main__':
    from flyai.dataset import Dataset
    dataset = Dataset(10, 32)
    train_x, train_y, val_x, val_y = dataset.get_all_data()
    preTrainedEmbedding = PreTrainedEmbedding()
    contents = [x['TEXT'] for x in train_x]
    unfounds = []
    for words in contents:
        print(words)
        vector, unfound = preTrainedEmbedding.turnToVectors(words)
        unfounds.append(unfound)
    print("unfound probability is: %f" % np.mean(unfounds))
Example #28
First-time users: see the 第一次使用请读我.html file in the project.
For FAQs visit: https://www.flyai.com/question
Red envelopes for feedback and questions! Add customer-service WeChat: flyaixzs
'''
'''
Project hyperparameters
'''
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=1, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=64, type=int, help="batch size")
args = parser.parse_args()
'''
Data-processing helper provided by the flyai library.
Pass in how many epochs to train over the whole dataset and the batch size per step.
'''
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
model = Model(dataset)

word_dict, word_dict_res = load_dict()
vocab_size = max(word_dict.values()) + 1

# hyperparameters
embedding_dim = 64  # embedding layer size
dnn_dim = 128  # dense layer size
max_seq_len = 128  # maximum sentence length
num_filters = 64  # number of convolution filters
kernel_size = 5  # convolution kernel size
learning_rate = 1e-3  # learning rate
numclass = 2  # number of classes

# placeholders for passing values
def main():
    """
    Project hyperparameters
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=8, type=int, help="batch size")
    args = parser.parse_args()

    # ------------------ select CUDA / CPU ----------------------
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    device = torch.device(device)

    # ------------------ preprocess the data ----------------------
    dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)

    network = Net.from_pretrained(arguments.bert_model, num_tag=len(arguments.labels)).to(device)
    logger.info('\nPreprocessing finished!\n')
    # --------------------- optimizer -------------------------
    param_optimizer = list(network.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
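    # Standard BERT fine-tuning practice: weight decay is applied to all parameters
    # except biases and LayerNorm weights, which are excluded via the no_decay list.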

    t_total = int(dataset.get_train_length() / arguments.gradient_accumulation_steps / args.BATCH * args.EPOCHS)

    # --------------------- GPU half precision (fp16) -----------------------------
    if arguments.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=arguments.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if arguments.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=arguments.loss_scale)

    # ------------------------ GPU single precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=arguments.learning_rate,
                             warmup=arguments.warmup_proportion,
                             t_total=t_total
                             )

    # --------------------- model initialization ----------------------
    if arguments.fp16:
        network.half()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []

    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(args.EPOCHS):
        network.train()
        for step in range(dataset.get_step() // args.EPOCHS):
            x_train, y_train = dataset.next_train_batch()
            batch = create_batch_iter(mode='train', X=x_train, y=y_train).dataset.tensors
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = network(input_ids, segment_ids, input_mask)
            train_loss = network.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)

            if arguments.gradient_accumulation_steps > 1:
                train_loss = train_loss / arguments.gradient_accumulation_steps

            if arguments.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % arguments.gradient_accumulation_steps == 0:
                def warmup_linear(x, warmup=0.002):
                    if x < warmup:
                        return x / warmup
                    return 1.0 - x
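                # Worked example with the default warmup=0.002: at x = 0.001 the factor is
                # 0.001 / 0.002 = 0.5 (linear ramp-up); once x >= warmup the factor decays
                # linearly as 1.0 - x, reaching 0 at the end of training (x = 1).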

                # modify learning rate with special warm up BERT uses
                lr_this_step = arguments.learning_rate * warmup_linear(global_step / t_total,
                                                                       arguments.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            predicts = network.predict(bert_encode, output_mask)
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()

            train_acc, f1 = network.acc_f1(predicts, label_ids)

        logger.info("\n train_acc: %f - train_loss: %f - f1: %f - using time: %f - step: %d \n"
                    % (train_acc, train_loss.item(), f1, time.time() - start, step))

        # ----------------------- validation ----------------------------
        network.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step in range(dataset.get_step() // args.EPOCHS):
                x_val, y_val = dataset.next_validation_batch()
                batch = create_batch_iter(mode='dev', X=x_val, y=y_val).dataset.tensors
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = network(input_ids, segment_ids, input_mask).cpu()
                eval_los = network.loss_fn(bert_encode=bert_encode, tags=label_ids, output_mask=output_mask)
                eval_loss = eval_los + eval_loss
                count += 1
                predicts = network.predict(bert_encode, output_mask)
                y_predicts.append(predicts)

                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)

            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()
            print('eval:')
            print(eval_predicted.numpy().tolist())
            print(eval_labeled.numpy().tolist())

            eval_acc, eval_f1 = network.acc_f1(eval_predicted, eval_labeled)
            network.class_report(eval_predicted, eval_labeled)

            logger.info(
                '\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
                % (e + 1, train_loss.item(), eval_loss.item() / count, train_acc, eval_acc, eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(network, arguments.output_dir)

            if e % 1 == 0:
                train_losses.append(train_loss.item())
                train_accuracy.append(train_acc)
                eval_losses.append(eval_loss.item() / count)
                eval_accuracy.append(eval_acc)