def calcAllFirstReplyDetail(outputPath):
    """Write one row per first reply, with full message detail, to a CSV report."""
    dp = DataProcessor(VipLvPath, InfoPath)
    dp.preprocess(includeSystem=False, filterNotOnDuty=False, filterOffWork=False)
    # 'w' with newline='' is the Python 3 idiom for csv.writer; 'wb+' would reject str rows.
    with open(outputPath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Agent ID", "Player ID", "Player VIP Level",
                         "Player Message Received At", "Player Message Content",
                         "Reply Sent At", "Reply Content", "Reply Interval"])
        for output in dp.calcFirstReplyDetail():
            if not output:
                continue
            writer.writerow(output)
def calcFirstReplyData(outputPath, includeSystem=True):
    """Write per-VIP-level first-response statistics to a CSV report."""
    dp = DataProcessor(VipLvPath, InfoPath)
    dp.preprocess(includeSystem=includeSystem)
    with open(outputPath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["VIP Level", "Total Player Messages Sent",
                         "Responses Within 3 Minutes", "3-Minute Response Rate",
                         "Average Response Time"])
        for lv in sorted(dp.lvset):
            output = dp.calcLvData(lv)
            if not output:
                continue
            writer.writerow(output)
def calcPidWithoutLv(outputPath):
    """List player IDs that appear in the message log but have no VIP level record."""
    dp = DataProcessor(VipLvPath, InfoPath)
    dp.preprocess(includeSystem=False)
    pidset = set()
    for msg in dp.msglist:
        if msg.pid not in dp.pid2lv:
            pidset.add(msg.pid)
    with open(outputPath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Player IDs Missing VIP Level Info"])
        for pid in pidset:
            writer.writerow([pid])
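
# A minimal usage sketch for the three report helpers above, assuming VipLvPath and
# InfoPath are module-level constants pointing at the raw exports. The output file
# names below are placeholders for illustration, not paths used elsewhere in the project.
if __name__ == "__main__":
    calcAllFirstReplyDetail("first_reply_detail.csv")        # per-message first-reply detail
    calcFirstReplyData("first_reply_by_level.csv",           # per-VIP-level response stats
                       includeSystem=False)
    calcPidWithoutLv("players_missing_level.csv")            # players with no VIP record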
def __init__(self):
    # super().__init__()
    processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
    self.train_X, self.test_X, self.train_y, self.test_y = processor.process()
    self.num_epoch = int(os.getenv("CLASSIFIER_NUM_EPOCH"))
    self.model = nn.Sequential(
        # nn.Embedding(200, 1),
        # nn.ReLU(),
        # nn.MaxPool1d(1),
        # nn.Flatten(),
        nn.Linear(200, 1))
    self.loss = nn.BCEWithLogitsLoss()
    self.optimizer = optim.SGD(self.model.parameters(),
                               lr=0.05, momentum=0.9, weight_decay=0.001)
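
# A minimal training-loop sketch showing how the model, BCEWithLogitsLoss and SGD
# optimizer configured above would typically be driven. The helper name `fit_classifier`,
# the tensor conversion, and the 200-dim float features are assumptions for illustration,
# not part of the original class; assumes `import torch` alongside the existing nn/optim imports.
import torch

def fit_classifier(clf):
    X = torch.as_tensor(clf.train_X, dtype=torch.float32)            # (N, 200) features
    y = torch.as_tensor(clf.train_y, dtype=torch.float32).view(-1, 1)
    for epoch in range(clf.num_epoch):
        clf.optimizer.zero_grad()
        logits = clf.model(X)          # raw scores; BCEWithLogitsLoss applies the sigmoid
        loss = clf.loss(logits, y)
        loss.backward()
        clf.optimizer.step()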
class SimulacrumGenerator:
    """Embedding+LSTM generator trained on (received, sent) message pairs from DataProcessor."""

    def __init__(self, max_words=1000, max_len=50, num_epochs=10, batch_size=128):
        self.simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.max_words = max_words
        self.max_len = max_len
        self.tok = Tokenizer(num_words=max_words)
        self.processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
        self.model = self.architecture()
        self.model.compile(loss='binary_crossentropy',
                           optimizer=RMSprop(),
                           metrics=['accuracy'])

    def architecture(self):
        inputs = Input(name='inputs', shape=[self.max_len])
        layer = Embedding(self.max_words, self.max_len, input_length=self.max_len)(inputs)
        layer = LSTM(64)(layer)
        layer = Dense(self.max_len, name='out_layer')(layer)
        layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model

    def architecture2(self):
        inputs = Input(name='inputs', batch_shape=(self.batch_size, self.max_len))
        layer = Embedding(self.max_words, self.max_len)(inputs)
        layer = GRU(1024, recurrent_initializer='glorot_uniform', stateful=True)(layer)
        layer = Dense(self.max_len, name='out_layer')(layer)
        # layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model

    def tokenize_sentences(self, sentences):
        sequences = self.tok.texts_to_sequences(sentences)
        # sequences = []
        # for vector in self.tok.texts_to_sequences(sentences):
        #     sequences.append(np.interp(vector, (0, self.max_words), (0, 1)))
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

    def detokenzie(self, vectors):
        return self.tok.sequences_to_texts((vectors * 10000).astype("int"))
        # return self.tok.sequences_to_texts(np.interp(vectors, (0, 1), (0, self.max_words)).astype("int"))

    def create_inputs(self, sentences=None):
        if sentences is None:
            self.processor.extract()
            sentences = self.processor.received
        self.tok.fit_on_texts(sentences)
        # self.max_words = len(sentences)
        return self.tokenize_sentences(sentences)

    def generate(self, sentences=None):
        if sentences is None:
            inputs = self.create_inputs()
        else:
            inputs = self.create_inputs(sentences)
        return np.array(self.model.predict(inputs)), np.zeros(len(inputs))

    def train(self, callbacks=None):
        # cb = [EarlyStopping(monitor='val_loss', min_delta=0.0001)]
        cb = []
        if callbacks is not None:
            cb.extend(callbacks)
        self.processor.extract()
        train_X = []
        train_y = []
        for pair in self.processor.pairs:
            train_X.append(self.processor.received[pair[1]])
            train_y.append(self.processor.sent[pair[0]])
        self.model.fit(self.create_inputs(train_X),
                       self.create_inputs(train_y),
                       epochs=self.num_epochs,
                       batch_size=self.batch_size,
                       validation_split=0.2,
                       callbacks=cb)


# generator = SimulacrumGenerator()
# outputs, y = generator.generate()
# print(outputs[0], generator.tokenize_sentences(generator.processor.received)[0])
# print(generator.detokenzie(outputs))
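
# A minimal usage sketch for SimulacrumGenerator, assuming the SIMULACRUM_NAME environment
# variable is set and DataProcessor can locate that simulacrum's chat export. The call
# pattern mirrors the commented example above, with training added before generation.
generator = SimulacrumGenerator(max_words=1000, max_len=50, num_epochs=10)
generator.train()                                   # fit the LSTM on (received, sent) pairs
outputs, _ = generator.generate(["hello there"])    # predict a reply vector for a new prompt
print(generator.detokenzie(outputs))                # map the vector back to tokens (lossy)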
class SimulacrumDiscriminator:
    """Embedding+LSTM binary classifier over labelled sentences from DataProcessor.plain_label()."""

    def __init__(self, max_words=1000, max_len=50, num_epochs=10, batch_size=128):
        self.simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.max_words = max_words
        self.max_len = max_len
        self.tok = Tokenizer(num_words=max_words)
        self.processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
        self.model = self.architecture()
        self.model.compile(loss='binary_crossentropy',
                           optimizer=RMSprop(),
                           metrics=['accuracy'])

    def architecture(self):
        inputs = Input(name='inputs', shape=[self.max_len])
        layer = Embedding(self.max_words, 50, input_length=self.max_len)(inputs)
        layer = LSTM(64)(layer)
        layer = Dense(256, name='FC1')(layer)
        layer = Activation('tanh')(layer)
        layer = Dropout(0.5)(layer)
        layer = Dense(1, name='out_layer')(layer)
        layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model

    def tokenize_sentences(self, sentences):
        sequences = self.tok.texts_to_sequences(sentences)
        # sequences = []
        # for vector in self.tok.texts_to_sequences(sentences):
        #     sequences.append(np.interp(vector, (0, self.max_words), (0, 1)))
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

    def train(self, train_X=None, train_y=None, callbacks=None):
        if train_X is None and train_y is None:
            self.train_X, self.test_X, self.train_y, self.test_y = self.processor.plain_label()
            train_X, train_y = self.train_X, self.train_y
        self.tok.fit_on_texts(train_X)
        self.fit(self.tokenize_sentences(train_X), train_y, callbacks=callbacks)
        return self

    def fit(self, sequences_matrix, train_y, callbacks=None):
        cb = [EarlyStopping(monitor='val_loss', min_delta=0.0001)]
        if callbacks is not None:
            cb.extend(callbacks)
        self.model.fit(sequences_matrix, train_y,
                       batch_size=self.batch_size,
                       epochs=self.num_epochs,
                       validation_split=0.2,
                       callbacks=cb)
        return self

    def evaluate(self, test_X=None, test_y=None):
        if test_X is None and test_y is None:
            self.train_X, self.test_X, self.train_y, self.test_y = self.processor.plain_label()
            test_X, test_y = self.test_X, self.test_y
        accr = self.model.evaluate(self.tokenize_sentences(test_X), test_y)
        print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0], accr[1]))

    def predict(self, sentences):
        # Pad to max_len so the input matches the shape the model was trained on.
        return self.model.predict(self.tokenize_sentences(sentences))

    def save(self, train_X=None, test_X=None, train_y=None, test_y=None, with_data=False):
        self.model.save(f"data/SimulacrumDiscriminator_{self.simulacrum_name}.model")
        settings = np.array([self.num_epochs, self.batch_size, self.max_words, self.max_len])
        print("Saving Settings: ", settings)
        np.savetxt(f"data/SD_settings_{self.simulacrum_name}.csv", settings, delimiter=",")
        if with_data:
            if train_X is None and test_X is None and train_y is None and test_y is None:
                self.processor.cache_results(self.train_X, self.test_X,
                                             self.train_y, self.test_y)
            else:
                self.processor.cache_results(train_X, test_X, train_y, test_y)

    def load(self, simulacrum_name=None, with_data=False):
        # Resolve the environment variable at call time rather than at import time.
        if simulacrum_name is None:
            simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.model = load_model(f"data/SimulacrumDiscriminator_{simulacrum_name}.model")
        self.num_epochs, self.batch_size, self.max_words, self.max_len = np.loadtxt(
            f"data/SD_settings_{simulacrum_name}.csv", delimiter=",")
        self.simulacrum_name = simulacrum_name
        if with_data:
            self.train_X, self.test_X, self.train_y, self.test_y = self.processor.load_cache(
                simulacrum_name)
        return self
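
# A minimal end-to-end sketch for SimulacrumDiscriminator, assuming SIMULACRUM_NAME is set
# and DataProcessor.plain_label() returns a labelled train/test split of sentences.
disc = SimulacrumDiscriminator(max_words=1000, max_len=50, num_epochs=10)
disc.train()                       # fits the tokenizer and the LSTM classifier
disc.evaluate()                    # prints loss/accuracy on the held-out split
disc.save(with_data=True)          # persists the model, settings CSV and cached split
scores = disc.predict(["is this something the simulacrum would say?"])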
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        choices=["NLI_M", "QA_M", "NLI_B", "QA_B"],
                        help="The name of the task to train.")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--eval_test",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients",
                        type=int,
                        default=1,
                        help="Number of steps to accumulate gradients over (the batch size is divided and gradients accumulated)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                args.accumulate_gradients))
    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # Sentiment labels (Chinese strings kept so they match the dataset):
    # -2 = "not mentioned", 0 = "neutral", 1 = "positive", -1 = "negative".
    sentiment_label = {"-2": "未提及", "0": "中性", "1": "正面", "-1": "负面"}

    # prepare dataloaders
    processor = DataProcessor(args.data_dir, args.task_name, sentiment_label)
    label_list = processor.get_labels()

    tokenizer = tokenization.ch_Tokenizer(vocab_file=args.vocab_file,
                                          do_lower_case=args.do_lower_case)

    # training set
    def load_func(line, load_mode='train'):
        data = line.strip().split('\t')
        guid = "%s-%s" % (load_mode, data[0])
        text_a = tokenization.convert_to_unicode(data[2])
        text_b = tokenization.convert_to_unicode(data[3])
        label = tokenization.convert_to_unicode(data[1])
        example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
        feature = convert_example_to_feature(example, label_list,
                                             args.max_seq_length, tokenizer)
        return feature

    def load_func_train(line):
        return load_func(line, load_mode='train')

    def load_func_test(line):
        return load_func(line, load_mode='test')

    def batchify(batch):
        all_input_ids = torch.tensor([f.input_ids for f in batch], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in batch], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in batch], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in batch], dtype=torch.long)
        return [all_input_ids, all_input_mask, all_segment_ids, all_label_ids]

    train_path = processor.get_train_path()
    test_path = processor.get_test_path()
    train_data = tnt.dataset.ListDataset(train_path, load_func_train)

    num_train_steps = int(len(train_data) / args.train_batch_size * args.num_train_epochs)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=batchify)

    # model and optimizer
    model = BertForSequenceClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Apply weight decay to all parameters except biases and LayerNorm weights.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write("epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

        # eval_test
        if args.eval_test:
            test_dataset = tnt.dataset.ListDataset(test_path, load_func_test)
            test_dataloader = DataLoader(dataset=test_dataset,
                                         batch_size=args.eval_batch_size,
                                         collate_fn=batchify,
                                         shuffle=False)
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(os.path.join(args.output_dir, "test_ep_" + str(epoch) + ".txt"),
                      "w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    with torch.no_grad():
                        tmp_test_loss, logits = model(input_ids, segment_ids,
                                                      input_mask, label_ids)
                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == label_ids)
                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy
                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1
            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps,
                      'test_loss': test_loss,
                      'test_accuracy': test_accuracy}
        else:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps}

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")
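
# The excerpt above defines main() but ends without an entry point; the standard guard
# below is an assumed addition, and the script name and flag values in the comment are
# illustrative placeholders, not paths taken from the original repository.
#
#   python <script>.py --task_name NLI_M --data_dir data/ \
#       --vocab_file vocab.txt --bert_config_file bert_config.json \
#       --output_dir out/ --init_checkpoint pytorch_model.bin --eval_test
if __name__ == "__main__":
    main()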