def __init__(self, debug=False):
    super(Config, self).__init__(os.path.basename(__file__).split(".")[0], debug)
    self.seed = 279
    self.set_seed(self.seed)

    # Number of workers when parallelism is needed
    self.nb_workers = 4

    # Reading and parsing the raw data
    self.loader_cls = TrainLoader
    self.train_file = abspath("data/train.csv")
    self.valid_file = abspath("data/valid.csv")
    self.premise = "content"
    self.hypothesis = None
    self.shuffle = True
    self.header = 0
    self.sep = ","
    self.encoding = "utf-8"
    self.if_lower = True

    # Subword length, embedding dimension, transfer_path
    self.max_seq = 1024
    self.subword_embed_dim = 128
    self.fine_tune_embed = True
    self.truncate_method = "head_tail"

    # Batching
    self.sort_within_batch = False
    self.batch_size = 32

    # Model structure
    self.rnn_hidden_size = 256
    self.rnn_hidden_layers = 1
    self.unit_linear_size = 128

    # Learning rate and optimization
    self.epochs = 50
    self.lr = 5e-5
    self.dropout = 0.5
    self.weight_decay = 5e-4
    self.warm_up_proportion = 0.1
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 5
    self.schedule_per_batches = 400

    # Validation during training
    self.improve_require = 80000
    self.eval_per_batches = 400
    self.f1_average = "macro"

    # Globals computed and assigned later
    self.classes = None
    self.num_classes = None
    self.feature_cols = None
    self.num_labels = None

    if self.debug:
        self.debug_set()
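
# A minimal sketch (assumption, not repo code) of how lr, adam_epsilon, weight_decay and
# warm_up_proportion above could drive a linear warm-up schedule; `total_batches` and the
# placeholder model are hypothetical stand-ins.
import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=5e-4)
total_batches = 10000  # hypothetical: epochs * batches per epoch
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_batches),  # warm_up_proportion of all steps
    num_training_steps=total_batches,
)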

def debug_set(self):
    self.epochs = 1
    self.batch_size = 2
    self.model_ckpt = os.path.join(self.model_dir, "debug.{}.ckpt")
    self.logger_file = os.path.join(self.logger_dir, "debug.{}.log")
    self.processed_train = abspath("data/debug.processed.train.csv")
    self.processed_valid = abspath("data/debug.processed.valid.csv")
    self.word_tokenizer_path = os.path.join(self.data_cache, "debug.word.tokenizer.pt")
    self.char_tokenizer_path = os.path.join(self.data_cache, "debug.char.tokenizer.pt")
    self.word_w2v = abspath("library/embed.30w.txt")
    # self.word_w2v = os.path.join(self.data_cache, "debug.word.w2v.txt")
    self.char_w2v = os.path.join(self.data_cache, "debug.char.w2v.txt")
    self.processed_np = os.path.join(self.data_cache, "debug.processed.npz")
    # self.transfer_path = "/Users/Vander/Code/pytorch_col/albert-base-chinese"
    self.transfer_path = "/Users/Vander/Code/pytorch_col/albert_chinese_base_hf"
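
# Hedged sketch: the "{}" placeholders in model_ckpt/logger_file above appear meant for a
# timestamp; this is one plausible way to fill them (model_dir here is illustrative only).
import os
from datetime import datetime

model_dir = "checkpoints/demo"  # hypothetical directory
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
ckpt_path = os.path.join(model_dir, "debug.{}.ckpt").format(stamp)
print(ckpt_path)  # e.g. checkpoints/demo/debug.20240101_120000.ckpt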

def __init__(self, name, debug):
    self.name = name
    self.debug = debug

    # Training device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_gpu = torch.cuda.device_count()

    self.data_cache = abspath(f"data/{name}")
    if not os.path.exists(self.data_cache):
        os.makedirs(self.data_cache)

    # CSV files produced by pretreatment, shared globally
    self.processed_train = abspath("data/processed.train.csv")
    self.processed_valid = abspath("data/processed.valid.csv")

    # Path of the optional transfer (pretrained) model
    self.transfer_path = "/data/wangqian/berts/albert_chinese_base_hf"

    # Serialized tokenizers, including the vocab and the tokenization method
    self.word_tokenizer_path = os.path.join(self.data_cache, "word.tokenizer.pt")
    self.char_tokenizer_path = os.path.join(self.data_cache, "char.tokenizer.pt")

    # Externally trained w2v files, currently in gensim KeyedVectors format; these
    # default to external word vectors but can also be trained in-house
    self.word_w2v = os.path.join(self.data_cache, "word.w2v.txt")
    self.char_w2v = os.path.join(self.data_cache, "char.w2v.txt")

    # For now, storing the data frames in numpy form works better:
    # covers train_df/valid_df/word_embed_mat/char_embed_mat/columns
    self.processed_np = os.path.join(self.data_cache, "processed.npz")

    # Checkpoint location; checkpoints are named with a timestamp
    self.model_dir = abspath(f"checkpoints/{name}")
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    self.model_ckpt = os.path.join(self.model_dir, "{}.ckpt")
    self.max_backup = 3  # maximum number of checkpoints to keep

    # Training-loss summaries
    self.summary_dir = abspath(f"summary/{name}")
    if not os.path.exists(self.summary_dir):
        os.makedirs(self.summary_dir)

    # Logging
    self.logger_name = name
    self.logger_dir = abspath(f"log/{name}")
    if not os.path.exists(self.logger_dir):
        os.makedirs(self.logger_dir)
    self.logger_file = os.path.join(self.logger_dir, "{}.log")

    # Restore mode
    self.restore_model = False
    self.default_scale = 1

    # Fields reserved for debug_set
    self.epochs = None
    self.batch_size = None
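
# set_seed() is called by the subclass configs but not defined in this excerpt; a minimal
# sketch (assumption) of what it presumably does, seeding every RNG the pipeline touches.
import random

import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)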

def setUp(self):
    super(TestVectors, self).setUp()
    self.contents = [
        '这是一个很好的餐馆,菜很不好吃,我还想再去',
        '这是一个很差的餐馆,菜很不好吃,我不想再去',
        '这是一个很好的餐馆,菜很好吃,我还想再去',
        '这是一个很好的餐馆,只是菜很难吃,我还想再去',
        '这是一个很好的餐馆,只是菜很不好吃,我还想再去',
        '好吃的!黑胡椒牛肉粒真的是挺咸的',
        '不论是环境的宽敞度还是菜的味道上',
        '烤鸭皮酥脆入口即化,总体还可以',
        '烤鸭皮酥脆入口即化',
        '软炸鲜菇据说是他家的优秀美味担当',
        '环境挺好的,服务很到位',
        '肉松的味道都不错,量一般',
        '也不算便宜,不过吃着好吃',
        '高大上的餐厅,一级棒的环境',
        '比较硬,比较喜欢芝士和咖喱口味的',
        '有嚼劲,很入味宫廷豌豆黄造型可爱',
        '蔬菜中规中矩,解腻不错',
        '女友生日菜有点贵架势不错味道就这样',
        '相比其他兰州拉面粗旷的装饰风格,这家设计很小清新,座位宽敞,客人不多'
    ]
    self.self_train_w2v = abspath("tests/vocab/train.w2v.txt")
    # Split each sentence into a character list for char-level vector training
    self.contents = [list(content) for content in self.contents]
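
# Hedged sketch of what TestVectors likely exercises: training a small word2vec model on
# the character lists above with gensim (gensim>=4 keyword names assumed) and saving it in
# the KeyedVectors text format that self_train_w2v points at.
from gensim.models import Word2Vec

sentences = [list("这是一个很好的餐馆"), list("环境挺好的,服务很到位")]  # stand-in for self.contents
model = Word2Vec(sentences, vector_size=128, window=5, min_count=1, epochs=20)
model.wv.save_word2vec_format("train.w2v.demo.txt")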
"environment_decoration": "y", "environment_noise": "y", "environment_space": "y", "environment_cleaness": "y", "dish_portion": "y", "dish_taste": "y", "dish_look": "y", "dish_recommendation": "y", "others_overall_experience": "y", "others_willing_to_consume_again": "y", }) skip_header = True delimiter = "\x01" # 训练数据 train_file = abspath("data/sa.train.csv") valid_file = abspath("data/sa.valid.csv") # 模型参数数据、 resume = True user_dict = abspath("library/user.dict") vector_cache = abspath("library/") min_freq = 1 use_pre_embed = False pre_embeddings = abspath("library/embeddings.300w.txt") extend_vocab = True pre_vocab_size = 200000 model_file = abspath(f"checkpoints/{task_name}.model.ckpt") field_file = abspath(f"checkpoints/{task_name}.field.ckpt") summary_dir = abspath(f"summary/{task_name}/")

def __init__(self, debug=False):
    super(Config, self).__init__(os.path.basename(__file__).split(".")[0], debug)
    self.seed = 279
    self.set_seed(self.seed)

    # Number of workers when parallelism is needed
    self.nb_workers = 4

    # Reading and parsing the raw data
    self.loader_cls = TrainLoader
    self.train_file = abspath("data/train.csv")
    self.valid_file = abspath("data/valid.csv")
    self.premise = "content"
    self.hypothesis = None
    self.shuffle = True
    self.header = 0
    self.sep = ","
    self.encoding = "utf-8"
    self.if_lower = True

    # Tokenization and vectorization
    self.max_seq = 512
    self.user_dict = abspath("library/user.30w.dict")
    self.word_max_vocab = 60000
    self.word_unk_token = "<unk>"
    self.word_pad_token = "<pad>"
    self.truncate_method = "head"
    self.word_window = 8
    self.word_min_count = 1
    self.word_iterations = 40

    # Embeddings
    self.word_embed_dim = 200
    self.word_w2v = abspath("library/embed.30w.txt")

    # Batching
    self.sort_within_batch = False
    self.batch_size = 32

    # Model structure
    self.rnn_hidden_size = 256
    self.rnn_hidden_layers = 1
    self.unit_linear_size = 128

    # Learning rate and optimization
    self.epochs = 50
    self.lr = 1e-4
    self.dropout = 0.5
    self.weight_decay = 5e-4
    self.warm_up_proportion = 0.1
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 5
    self.schedule_per_batches = 400

    # Validation during training
    self.improve_require = 50000
    self.eval_per_batches = 400
    self.f1_average = "macro"

    # Globals computed and assigned later
    self.classes = None
    self.num_classes = None
    self.word_embed_matrix = None
    self.feature_cols = None
    self.num_labels = None
    self.word_vocab_size = None

    if self.debug:
        self.debug_set()
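
# Hedged sketch of how word_embed_matrix above might be computed from the word_w2v file:
# load gensim KeyedVectors and copy rows by vocab index (the vocab map is hypothetical).
import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("library/embed.30w.txt")
vocab = {"<pad>": 0, "<unk>": 1, "好吃": 2}  # hypothetical word -> index map
matrix = np.random.normal(0, 0.1, (len(vocab), kv.vector_size))
for word, idx in vocab.items():
    if word in kv:
        matrix[idx] = kv[word]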

# Convert the tab-separated raw files into \x01-delimited copies for downstream loading
import pandas as pd

from utils.path_util import abspath

train_file = abspath("data/sa_train.csv")
valid_file = abspath("data/sa_valid.csv")
test_file = abspath("data/sa_test.csv")

train_df = pd.read_csv(train_file, header=0, sep="\t")
valid_df = pd.read_csv(valid_file, header=0, sep="\t")
test_df = pd.read_csv(test_file, header=0, sep="\t")

train_df.to_csv(abspath("data/sa.train.csv"), index=False, sep="\x01")
valid_df.to_csv(abspath("data/sa.valid.csv"), index=False, sep="\x01")
test_df.to_csv(abspath("data/sa.test.csv"), index=False, sep="\x01")
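
# Optional hedged check (not part of the original script): read one converted file back
# to confirm the \x01 separator round-trips without losing rows.
reloaded = pd.read_csv(abspath("data/sa.train.csv"), header=0, sep="\x01")
assert len(reloaded) == len(train_df)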

def __init__(self, debug=False):
    super(Config, self).__init__(os.path.basename(__file__).split(".")[0], debug)
    self.seed = 279
    self.set_seed(self.seed)

    # Number of workers when parallelism is needed
    self.nb_workers = 4

    # Reading and parsing the raw data
    self.loader_cls = TrainLoader
    self.train_file = abspath("data/train.csv")
    self.valid_file = abspath("data/valid.csv")
    self.premise = "content"
    self.hypothesis = None
    self.shuffle = True
    self.header = 0
    self.sep = ","
    self.encoding = "utf-8"
    self.if_lower = True

    # Tokenization and vectorization
    self.max_seq = 1000
    self.stop_dict = abspath("library/stop_symbols.dict")
    self.max_vocab = 20000
    self.pad_token = "<pad>"
    self.unk_token = "<unk>"
    self.tokenize_method = "char"
    self.truncate_method = "head"

    # Word embeddings
    self.w2v_path = os.path.join(self.data_cache, "w2v.txt")
    self.embed_dim = 128
    self.window = 10
    self.min_count = 1
    self.iterations = 20

    # Batching
    self.sort_within_batch = False
    self.batch_size = 64

    # Model structure
    self.hidden_dim = 128
    self.bidirectional = True
    self.num_layers = 1
    self.linear_dim = 128

    # Learning rate and optimization
    self.epochs = 80
    self.lr = 8e-5
    self.dropout = 0.5
    self.weight_decay = 5e-4
    self.warm_up_proportion = 0.1
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 5
    self.schedule_per_batches = 200

    # Validation during training
    self.improve_require = 50000
    self.eval_per_batches = 200
    self.f1_average = "macro"

    # Globals computed and assigned later
    self.classes = None
    self.num_classes = None
    self.num_labels = None
    self.embed_matrix = None

    if self.debug:
        self.debug_set()
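
# A minimal sketch (assumption) of the encoder the structure settings above describe: a
# single-layer bidirectional LSTM over embed_dim inputs feeding a linear_dim projection.
import torch

embed_dim, hidden_dim, num_layers, linear_dim = 128, 128, 1, 128
rnn = torch.nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                    bidirectional=True, batch_first=True)
linear = torch.nn.Linear(hidden_dim * 2, linear_dim)  # *2 for the two directions

x = torch.randn(4, 50, embed_dim)  # (batch, seq_len, embed_dim)
out, _ = rnn(x)
feats = linear(out[:, -1, :])  # last time step -> shape (4, linear_dim)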

def __init__(self, debug=False):
    super(Config, self).__init__(os.path.basename(__file__).split(".")[0], debug)
    self.seed = 279
    self.set_seed(self.seed)

    # Number of workers when parallelism is needed
    self.nb_workers = 4

    # Reading and parsing the raw data
    self.loader_cls = TrainLoader
    self.train_file = abspath("data/train.csv")
    self.valid_file = abspath("data/valid.csv")
    self.premise = "content"
    self.hypothesis = None
    self.shuffle = True
    self.max_seq = 1000
    self.header = 0
    self.sep = ","
    self.encoding = "utf-8"
    self.if_lower = True

    # Tokenization and indexing
    self.stop_dict = abspath("library/stop_symbols.dict")
    if debug:
        self.transfer_path = "/Users/Vander/Code/pytorch_col/albert-base-chinese"
    else:
        self.transfer_path = "/data/wangqian/berts/albert-base-chinese"
    self.fine_tune_embed = True
    self.truncate_method = "head"

    # Word embeddings
    self.embed_dim = 128

    # Batching
    self.sort_within_batch = False
    self.batch_size = 32

    # Model structure
    self.hidden_dim = 256
    self.bidirectional = True
    self.num_layers = 1
    self.linear_dim = 256
    self.highway_layers = 3

    # Learning rate and optimization
    self.epochs = 80
    self.lr = 8e-5
    self.dropout = 0.5
    self.weight_decay = 5e-4
    self.warm_up_proportion = 0.1
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 5
    self.schedule_per_batches = 400

    # Validation during training
    self.improve_require = 50000
    self.eval_per_batches = 400
    self.f1_average = "macro"

    # Globals computed and assigned later
    self.classes = None
    self.num_classes = None
    self.num_labels = None
    self.embed_matrix = None

    if self.debug:
        self.debug_set()
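
# Hedged sketch of loading the transfer model at transfer_path with Hugging Face
# transformers; AutoModel/AutoTokenizer are assumptions (some Chinese ALBERT checkpoints
# ship a BERT-style vocab and need BertTokenizer instead).
from transformers import AutoModel, AutoTokenizer

transfer_path = "/data/wangqian/berts/albert-base-chinese"  # path from the config above
tokenizer = AutoTokenizer.from_pretrained(transfer_path)
model = AutoModel.from_pretrained(transfer_path)
hidden_size = model.config.hidden_size  # width fed to the downstream layers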