Example #1
    def __init__(self, debug=False):
        super(Config,
              self).__init__(os.path.basename(__file__).split(".")[0], debug)
        self.seed = 279
        self.set_seed(self.seed)

        # number of workers when running in parallel
        self.nb_workers = 4

        # reading and parsing the raw data
        self.loader_cls = TrainLoader
        self.train_file = abspath("data/train.csv")
        self.valid_file = abspath("data/valid.csv")
        self.premise = "content"
        self.hypothesis = None
        self.shuffle = True
        self.header = 0
        self.sep = ","
        self.encoding = "utf-8"
        self.if_lower = True

        # subword length, embedding dimension, transfer_path
        self.max_seq = 1024
        self.subword_embed_dim = 128
        self.fine_tune_embed = True
        self.truncate_method = "head_tail"

        # batching
        self.sort_within_batch = False
        self.batch_size = 32

        # model architecture
        self.rnn_hidden_size = 256
        self.rnn_hidden_layers = 1
        self.unit_linear_size = 128

        # learning-rate and optimization settings
        self.epochs = 50
        self.lr = 5e-5
        self.dropout = 0.5
        self.weight_decay = 5e-4
        self.warm_up_proportion = 0.1
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 5
        self.schedule_per_batches = 400

        # validation during training
        self.improve_require = 80000
        self.eval_per_batches = 400
        self.f1_average = "macro"

        # global variables to be computed and assigned later
        self.classes = None
        self.num_classes = None
        self.feature_cols = None
        self.num_labels = None

        if self.debug:
            self.debug_set()
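
Note: set_seed is defined on the base class and is not shown in these examples. A minimal sketch of what such a helper typically seeds, assuming random, numpy and torch are all in use (the actual implementation may differ):

import os
import random

import numpy as np
import torch


def set_seed(seed):
    # Seed every RNG the pipeline touches so runs are reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op on CPU-only machines
    os.environ["PYTHONHASHSEED"] = str(seed)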
Example #2
    def debug_set(self):
        self.epochs = 1
        self.batch_size = 2
        self.model_ckpt = os.path.join(self.model_dir, "debug.{}.ckpt")
        self.logger_file = os.path.join(self.logger_dir, "debug.{}.log")

        self.processed_train = abspath("data/debug.processed.train.csv")
        self.processed_valid = abspath("data/debug.processed.valid.csv")
        self.word_tokenizer_path = os.path.join(self.data_cache,
                                                "debug.word.tokenizer.pt")
        self.char_tokenizer_path = os.path.join(self.data_cache,
                                                "debug.char.tokenizer.pt")
        self.word_w2v = abspath("library/embed.30w.txt")
        # self.word_w2v = os.path.join(self.data_cache, "debug.word.w2v.txt")
        self.char_w2v = os.path.join(self.data_cache, "debug.char.w2v.txt")
        self.processed_np = os.path.join(self.data_cache,
                                         "debug.processed.npz")
        # self.transfer_path = "/Users/Vander/Code/pytorch_col/albert-base-chinese"
        self.transfer_path = "/Users/Vander/Code/pytorch_col/albert_chinese_base_hf"
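
With debug=True the base __init__ (Example #3 below) ends by calling debug_set, so a single flag shrinks the run and redirects every artifact to a debug-prefixed path. A hypothetical check of that behavior:

conf = Config(debug=True)            # triggers debug_set() at the end of __init__
assert conf.epochs == 1 and conf.batch_size == 2
print(conf.model_ckpt)               # .../debug.{}.ckpt, kept separate from real runs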
Example #3
    def __init__(self, name, debug):
        self.name = name
        self.debug = debug

        # training device setup
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.n_gpu = torch.cuda.device_count()

        self.data_cache = abspath(f"data/{name}")
        if not os.path.exists(self.data_cache):
            os.makedirs(self.data_cache)

        # csv files after preprocessing, shared globally
        self.processed_train = abspath("data/processed.train.csv")
        self.processed_valid = abspath("data/processed.valid.csv")

        # path to the transfer (pretrained) model, if one is used
        self.transfer_path = "/data/wangqian/berts/albert_chinese_base_hf"

        # serialized tokenizers, including the vocab and the tokenization method
        self.word_tokenizer_path = os.path.join(self.data_cache,
                                                "word.tokenizer.pt")
        self.char_tokenizer_path = os.path.join(self.data_cache,
                                                "char.tokenizer.pt")

        # externally trained w2v files, currently in gensim KeyedVectors format; these default to external word vectors, but you can also train your own
        self.word_w2v = os.path.join(self.data_cache, "word.w2v.txt")
        self.char_w2v = os.path.join(self.data_cache, "char.w2v.txt")

        # for now, storing the dataframe data as numpy works better: includes train_df/valid_df/word_embed_mat/char_embed_mat/columns
        self.processed_np = os.path.join(self.data_cache, "processed.npz")

        # model checkpoint location; checkpoints are named with a timestamp
        self.model_dir = abspath(f"checkpoints/{name}")
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.model_ckpt = os.path.join(self.model_dir, "{}.ckpt")
        self.max_backup = 3  # maximum number of checkpoints to keep

        # training-loss summary settings
        self.summary_dir = abspath(f"summary/{name}")
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        # logging
        self.logger_name = name
        self.logger_dir = abspath(f"log/{name}")
        if not os.path.exists(self.logger_dir):
            os.makedirs(self.logger_dir)
        self.logger_file = os.path.join(self.logger_dir, "{}.log")

        # restore mode
        self.restore_model = False
        self.default_scale = 1

        # fields reserved for debug_set
        self.epochs = None
        self.batch_size = None
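
model_ckpt and logger_file are stored as "{}" format templates; per the comment above, the placeholder is meant to take a timestamp when a model is actually saved. A hypothetical usage sketch (BaseConfig as the class name and the task name are assumptions):

import time

conf = BaseConfig("text_rnn", debug=False)
stamp = time.strftime("%Y%m%d.%H%M%S")
ckpt_path = conf.model_ckpt.format(stamp)   # e.g. checkpoints/text_rnn/20240101.120000.ckpt
log_path = conf.logger_file.format(stamp)   # e.g. log/text_rnn/20240101.120000.log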
Example #4
    def setUp(self):
        super(TestVectors, self).setUp()

        self.contents = [
            '这是一个很好的餐馆,菜很不好吃,我还想再去', '这是一个很差的餐馆,菜很不好吃,我不想再去',
            '这是一个很好的餐馆,菜很好吃,我还想再去', '这是一个很好的餐馆,只是菜很难吃,我还想再去',
            '这是一个很好的餐馆,只是菜很不好吃,我还想再去', '好吃的!黑胡椒牛肉粒真的是挺咸的', '不论是环境的宽敞度还是菜的味道上',
            '烤鸭皮酥脆入口即化,总体还可以', '烤鸭皮酥脆入口即化', '软炸鲜菇据说是他家的优秀美味担当', '环境挺好的,服务很到位',
            '肉松的味道都不错,量一般', '也不算便宜,不过吃着好吃', '高大上的餐厅,一级棒的环境',
            '比较硬,比较喜欢芝士和咖喱口味的', '有嚼劲,很入味宫廷豌豆黄造型可爱', '蔬菜中规中矩,解腻不错',
            '女友生日菜有点贵架势不错味道就这样', '相比其他兰州拉面粗旷的装饰风格,这家设计很小清新,座位宽敞,客人不多'
        ]

        self.self_train_w2v = abspath("tests/vocab/train.w2v.txt")
        self.contents = [list(content) for content in self.contents]
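
self_train_w2v points at a self-trained vector file, and the contents are split into per-character lists, which is exactly the input shape gensim's Word2Vec expects. A sketch of producing such a file with gensim 4.x; the trainer and the hyperparameters (borrowed from Example #8) are assumptions:

from gensim.models import Word2Vec

# contents: the char-tokenized sentences built in setUp above
model = Word2Vec(sentences=contents, vector_size=128, window=10,
                 min_count=1, epochs=20)
model.wv.save_word2vec_format("tests/vocab/train.w2v.txt")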
Example #5
    "environment_decoration": "y",
    "environment_noise": "y",
    "environment_space": "y",
    "environment_cleaness": "y",
    "dish_portion": "y",
    "dish_taste": "y",
    "dish_look": "y",
    "dish_recommendation": "y",
    "others_overall_experience": "y",
    "others_willing_to_consume_again": "y",
})
skip_header = True
delimiter = "\x01"

# training data
train_file = abspath("data/sa.train.csv")
valid_file = abspath("data/sa.valid.csv")

# model parameter settings
resume = True
user_dict = abspath("library/user.dict")
vector_cache = abspath("library/")
min_freq = 1
use_pre_embed = False
pre_embeddings = abspath("library/embeddings.300w.txt")
extend_vocab = True
pre_vocab_size = 200000

model_file = abspath(f"checkpoints/{task_name}.model.ckpt")
field_file = abspath(f"checkpoints/{task_name}.field.ckpt")
summary_dir = abspath(f"summary/{task_name}/")
Example #6
    def __init__(self, debug=False):
        super(Config,
              self).__init__(os.path.basename(__file__).split(".")[0], debug)
        self.seed = 279
        self.set_seed(self.seed)

        # number of workers when running in parallel
        self.nb_workers = 4

        # reading and parsing the raw data
        self.loader_cls = TrainLoader
        self.train_file = abspath("data/train.csv")
        self.valid_file = abspath("data/valid.csv")
        self.premise = "content"
        self.hypothesis = None
        self.shuffle = True
        self.header = 0
        self.sep = ","
        self.encoding = "utf-8"
        self.if_lower = True

        # tokenization / vectorization
        self.max_seq = 512
        self.user_dict = abspath("library/user.30w.dict")
        self.word_max_vocab = 60000
        self.word_unk_token = "<unk>"
        self.word_pad_token = "<pad>"
        self.truncate_method = "head"
        self.word_window = 8
        self.word_min_count = 1
        self.word_iterations = 40

        # embedding settings
        self.word_embed_dim = 200
        self.word_w2v = abspath("library/embed.30w.txt")

        # batching
        self.sort_within_batch = False
        self.batch_size = 32

        # model architecture
        self.rnn_hidden_size = 256
        self.rnn_hidden_layers = 1
        self.unit_linear_size = 128

        # learning-rate and optimization settings
        self.epochs = 50
        self.lr = 1e-4
        self.dropout = 0.5
        self.weight_decay = 5e-4
        self.warm_up_proportion = 0.1
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 5
        self.schedule_per_batches = 400

        # validation during training
        self.improve_require = 50000
        self.eval_per_batches = 400
        self.f1_average = "macro"

        # global variables to be computed and assigned later
        self.classes = None
        self.num_classes = None
        self.word_embed_matrix = None
        self.feature_cols = None
        self.num_labels = None
        self.word_vocab_size = None

        if self.debug:
            self.debug_set()
Example #7
import pandas as pd
from utils.path_util import abspath

train_file = abspath("data/sa_train.csv")
valid_file = abspath("data/sa_valid.csv")
test_file = abspath("data/sa_test.csv")

train_df = pd.read_csv(train_file, header=0, sep="\t")
valid_df = pd.read_csv(valid_file, header=0, sep="\t")
test_df = pd.read_csv(test_file, header=0, sep="\t")

train_df.to_csv(abspath("data/sa.train.csv"), index=False, sep="\x01")
valid_df.to_csv(abspath("data/sa.valid.csv"), index=False, sep="\x01")
test_df.to_csv(abspath("data/sa.test.csv"), index=False, sep="\x01")
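
This script rewrites the tab-separated source files with "\x01" as the separator, matching delimiter = "\x01" in Example #5. A control character is a safer field separator than a comma or tab because it cannot occur inside free-text Chinese reviews. Reading the converted files back only requires passing the same separator:

import pandas as pd
from utils.path_util import abspath

train_df = pd.read_csv(abspath("data/sa.train.csv"), header=0, sep="\x01")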
Example #8
    def __init__(self, debug=False):
        super(Config,
              self).__init__(os.path.basename(__file__).split(".")[0], debug)
        self.seed = 279
        self.set_seed(self.seed)

        # number of workers when running in parallel
        self.nb_workers = 4

        # reading and parsing the raw data
        self.loader_cls = TrainLoader
        self.train_file = abspath("data/train.csv")
        self.valid_file = abspath("data/valid.csv")
        self.premise = "content"
        self.hypothesis = None
        self.shuffle = True
        self.header = 0
        self.sep = ","
        self.encoding = "utf-8"
        self.if_lower = True

        # tokenization and vectorization
        self.max_seq = 1000
        self.stop_dict = abspath("library/stop_symbols.dict")
        self.max_vocab = 20000
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.tokenize_method = "char"
        self.truncate_method = "head"

        # word-embedding settings
        self.w2v_path = os.path.join(self.data_cache, "w2v.txt")
        self.embed_dim = 128
        self.window = 10
        self.min_count = 1
        self.iterations = 20

        # batching
        self.sort_within_batch = False
        self.batch_size = 64

        # model architecture
        self.hidden_dim = 128
        self.bidirectional = True
        self.num_layers = 1
        self.linear_dim = 128

        # learning-rate and optimization settings
        self.epochs = 80
        self.lr = 8e-5
        self.dropout = 0.5
        self.weight_decay = 5e-4
        self.warm_up_proportion = 0.1
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 5
        self.schedule_per_batches = 200

        # validation during training
        self.improve_require = 50000
        self.eval_per_batches = 200
        self.f1_average = "macro"

        # global variables to be computed and assigned later
        self.classes = None
        self.num_classes = None
        self.num_labels = None
        self.embed_matrix = None

        if self.debug:
            self.debug_set()
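
tokenize_method = "char" together with stop_dict suggests texts are split into single characters with stop symbols filtered out before w2v training. A minimal sketch of that step; the helper name and the stop-dict format (one symbol per line) are assumptions:

def char_tokenize(text, stop_symbols):
    # Split into single characters, dropping stop symbols.
    return [ch for ch in text if ch not in stop_symbols]


with open("library/stop_symbols.dict", encoding="utf-8") as f:
    stop_symbols = {line.strip() for line in f if line.strip()}

tokens = char_tokenize("这是一个很好的餐馆,菜很好吃", stop_symbols)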
Example #9
    def __init__(self, debug=False):
        super(Config,
              self).__init__(os.path.basename(__file__).split(".")[0], debug)
        self.seed = 279
        self.set_seed(self.seed)

        # number of workers when running in parallel
        self.nb_workers = 4

        # reading and parsing the raw data
        self.loader_cls = TrainLoader
        self.train_file = abspath("data/train.csv")
        self.valid_file = abspath("data/valid.csv")
        self.premise = "content"
        self.hypothesis = None
        self.shuffle = True
        self.max_seq = 1000
        self.header = 0
        self.sep = ","
        self.encoding = "utf-8"
        self.if_lower = True

        # tokenization and indexing
        self.stop_dict = abspath("library/stop_symbols.dict")
        if debug:
            self.transfer_path = "/Users/Vander/Code/pytorch_col/albert-base-chinese"
        else:
            self.transfer_path = "/data/wangqian/berts/albert-base-chinese"
        self.fine_tune_embed = True
        self.truncate_method = "head"

        # word-embedding settings
        self.embed_dim = 128

        # batching
        self.sort_within_batch = False
        self.batch_size = 32

        # model architecture
        self.hidden_dim = 256
        self.bidirectional = True
        self.num_layers = 1
        self.linear_dim = 256
        self.highway_layers = 3

        # learning-rate and optimization settings
        self.epochs = 80
        self.lr = 8e-5
        self.dropout = 0.5
        self.weight_decay = 5e-4
        self.warm_up_proportion = 0.1
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 5
        self.schedule_per_batches = 400

        # validation during training
        self.improve_require = 50000
        self.eval_per_batches = 400
        self.f1_average = "macro"

        # global variables to be computed and assigned later
        self.classes = None
        self.num_classes = None
        self.num_labels = None
        self.embed_matrix = None

        if self.debug:
            self.debug_set()
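
transfer_path points at a local albert-base-chinese checkpoint, and fine_tune_embed = True implies its weights are loaded and then fine-tuned. A sketch of loading it with the Hugging Face transformers library, which is an assumption about what the project's loader actually uses:

from transformers import AutoModel, AutoTokenizer

transfer_path = "/data/wangqian/berts/albert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(transfer_path)
model = AutoModel.from_pretrained(transfer_path)

# Truncate to the model's positional limit (ALBERT supports up to 512 positions).
encoded = tokenizer("这是一个很好的餐馆", return_tensors="pt",
                    truncation=True, max_length=512)
outputs = model(**encoded)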