Example #2
def calcAllFirstReplyDetail(outputPath):
	dp = DataProcessor(VipLvPath, InfoPath)
	dp.preprocess(includeSystem=False, filterNotOnDuty=False, filterOffWork=False)

	# Text mode with newline='' so csv.writer works under Python 3
	with open(outputPath, 'w', newline='', encoding='utf-8') as csvfile:
		writer = csv.writer(csvfile)
		# Header: CS agent ID, player ID, player VIP level, time player message received,
		# player message content, time of first reply, reply content, reply delay
		writer.writerow(["客服ID", "玩家ID", "玩家VIP等级", "接收玩家消息时间", "玩家消息内容", "回复玩家消息时间", "回复玩家内容", "回复间隔"])
		details = dp.calcFirstReplyDetail()
		for output in details:
			if not output:
				continue
			writer.writerow(output)
Example #3
def calcFirstReplyData(outputPath, includeSystem=True):
	dp = DataProcessor(VipLvPath, InfoPath)
	dp.preprocess(includeSystem=includeSystem)

	with open(outputPath, 'w', newline='', encoding='utf-8') as csvfile:
		writer = csv.writer(csvfile)
		# Header: VIP level, total messages sent by players, replies within 3 minutes,
		# 3-minute response rate, average response time
		writer.writerow(["等级", "玩家发送信息总量", "3分钟内响应总量", "3分钟内响应率", "平均响应时长"])
		for lv in sorted(dp.lvset):
			output = dp.calcLvData(lv)
			if not output:
				continue
			writer.writerow(output)
Example #4
def calcPidWithoutLv(outputPath):
	dp = DataProcessor(VipLvPath, InfoPath)
	dp.preprocess(includeSystem=False)

	pidset = set()
	for msg in dp.msglist:
		if msg.pid not in dp.pid2lv:
			pidset.add(msg.pid)

	with open(outputPath, 'w', newline='', encoding='utf-8') as csvfile:
		writer = csv.writer(csvfile)
		# Header: player IDs with no VIP-level information
		writer.writerow(["缺等级信息玩家ID"])
		for pid in pidset:
			writer.writerow([pid, ])
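
A small driver like the one below could call the three report helpers above; the output file names are placeholders, and VipLvPath / InfoPath are assumed to be module-level constants as used in the snippets.

if __name__ == "__main__":
	# Hypothetical output paths; adjust to your environment.
	calcAllFirstReplyDetail("first_reply_detail.csv")
	calcFirstReplyData("first_reply_by_level.csv", includeSystem=False)
	calcPidWithoutLv("players_missing_vip_level.csv")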
def __init__(self):
    # super().__init__()
    processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
    self.train_X, self.test_X, self.train_y, self.test_y = processor.process()
    self.num_epoch = int(os.getenv("CLASSIFIER_NUM_EPOCH"))
    self.model = nn.Sequential(
        # nn.Embedding(200, 1),
        # nn.ReLU(),
        # nn.MaxPool1d(1),
        # nn.Flatten(),
        nn.Linear(200, 1))
    self.loss = nn.BCEWithLogitsLoss()
    self.optimizer = optim.SGD(self.model.parameters(),
                               lr=0.05,
                               momentum=0.9,
                               weight_decay=0.001)
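
A minimal full-batch training-loop sketch for the classifier fragment above, assuming the __init__ belongs to some classifier class (the instance is called clf here) and that processor.process() returned array-like features of width 200 with binary labels; neither detail is shown in the snippet itself.

import torch

def train_classifier(clf):
    # Full-batch gradient descent: clf.model is nn.Linear(200, 1), so it emits raw
    # logits and clf.loss (BCEWithLogitsLoss) applies the sigmoid internally.
    X = torch.as_tensor(clf.train_X, dtype=torch.float32)
    y = torch.as_tensor(clf.train_y, dtype=torch.float32).reshape(-1, 1)
    for _ in range(clf.num_epoch):
        clf.optimizer.zero_grad()
        loss = clf.loss(clf.model(X), y)
        loss.backward()
        clf.optimizer.step()
    return clf
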
class SimulacrumGenerator:

    def __init__(self, max_words=1000, max_len=50, num_epochs=10, batch_size=128):
        self.simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.max_words = max_words
        self.max_len = max_len
        self.tok = Tokenizer(num_words=max_words)
        self.processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
        self.model = self.architecture()
        self.model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

    def architecture(self):
        inputs = Input(name='inputs', shape=[self.max_len])
        layer = Embedding(self.max_words, self.max_len, input_length=self.max_len)(inputs)
        layer = LSTM(64)(layer)
        layer = Dense(self.max_len, name='out_layer')(layer)
        layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model

    def architecture2(self):
        inputs = Input(name='inputs', batch_shape=(self.batch_size, self.max_len))
        layer = Embedding(self.max_words, self.max_len)(inputs)
        layer = GRU(1024, recurrent_initializer='glorot_uniform', stateful=True)(layer)
        layer = Dense(self.max_len, name='out_layer')(layer)
        # layer = Activation('relu')(layer)
        model = Model(inputs=inputs, outputs=layer)
        return model


    def tokenize_sentences(self, sentences):
        sequences = self.tok.texts_to_sequences(sentences)
        # sequences = []
        # for vector in self.tok.texts_to_sequences(sentences):
        #     sequences.append(np.interp(vector, (0, self.max_words), (0, 1)))
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

    def detokenize(self, vectors):
        return self.tok.sequences_to_texts((vectors * 10000).astype("int"))
        # return self.tok.sequences_to_texts(np.interp(vectors, (0, 1), (0, self.max_words)).astype("int"))

    def create_inputs(self, sentences=None):
        if sentences is None:
            self.processor.extract()
            sentences = self.processor.received
        self.tok.fit_on_texts(sentences)
        # self.max_words = len(sentences)
        return self.tokenize_sentences(sentences)

    def generate(self, sentences=None):
        if sentences is None:
            inputs = self.create_inputs()
        else:
            inputs = self.create_inputs(sentences)
        return np.array(self.model.predict(inputs)), np.zeros(len(inputs))

    def train(self, callbacks=None):
        # cb = [EarlyStopping(monitor='val_loss', min_delta=0.0001)]
        cb = []
        if callbacks is not None:
            cb.extend(callbacks)

        self.processor.extract()
        train_X = []
        train_y = []
        for pair in self.processor.pairs:
            train_X.append(self.processor.received[pair[1]])
            train_y.append(self.processor.sent[pair[0]])

        self.model.fit(self.create_inputs(train_X), self.create_inputs(train_y), epochs=self.num_epochs,
                       batch_size=self.batch_size, validation_split=0.2, callbacks=cb)


# generator = SimulacrumGenerator()
# outputs, y = generator.generate()
# print(outputs[0], generator.tokenize_sentences(generator.processor.received)[0])
# print(generator.detokenize(outputs))
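
A hedged usage sketch for SimulacrumGenerator, extending the commented-out lines above by training before generating; it assumes SIMULACRUM_NAME is set in the environment and that DataProcessor can extract sent/received message pairs for that name.

generator = SimulacrumGenerator(max_words=1000, max_len=50, num_epochs=10)
generator.train()                          # fits the tokenizer and LSTM model on message pairs
outputs, _ = generator.generate()          # predicts responses for the received messages
print(generator.detokenize(outputs)[:3])   # rough detokenization back to text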
Example #7
class SimulacrumDiscriminator:
    def __init__(self,
                 max_words=1000,
                 max_len=50,
                 num_epochs=10,
                 batch_size=128):
        self.simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.max_words = max_words
        self.max_len = max_len
        self.tok = Tokenizer(num_words=max_words)
        self.processor = DataProcessor(os.getenv("SIMULACRUM_NAME"))
        self.model = self.architecture()
        self.model.compile(loss='binary_crossentropy',
                           optimizer=RMSprop(),
                           metrics=['accuracy'])

    def architecture(self):
        inputs = Input(name='inputs', shape=[self.max_len])
        layer = Embedding(self.max_words, 50,
                          input_length=self.max_len)(inputs)
        layer = LSTM(64)(layer)
        layer = Dense(256, name='FC1')(layer)
        layer = Activation('tanh')(layer)
        layer = Dropout(0.5)(layer)
        layer = Dense(1, name='out_layer')(layer)
        layer = Activation('sigmoid')(layer)  # sigmoid output pairs with binary_crossentropy
        model = Model(inputs=inputs, outputs=layer)
        return model

    def tokenize_sentences(self, sentences):
        sequences = self.tok.texts_to_sequences(sentences)
        # sequences = []
        # for vector in self.tok.texts_to_sequences(sentences):
        #     sequences.append(np.interp(vector, (0, self.max_words), (0, 1)))
        return sequence.pad_sequences(sequences, maxlen=self.max_len)

    def train(self, train_X=None, train_y=None, callbacks=None):
        if train_X is None and train_y is None:
            self.train_X, self.test_X, self.train_y, self.test_y = \
                self.processor.plain_label()
            train_X, train_y = self.train_X, self.train_y
        self.tok.fit_on_texts(train_X)
        self.fit(self.tokenize_sentences(train_X),
                 train_y,
                 callbacks=callbacks)
        return self

    def fit(self, sequences_matrix, train_y, callbacks=None):
        cb = [EarlyStopping(monitor='val_loss', min_delta=0.0001)]
        if callbacks is not None:
            cb.extend(callbacks)

        self.model.fit(sequences_matrix,
                       train_y,
                       batch_size=self.batch_size,
                       epochs=self.num_epochs,
                       validation_split=0.2,
                       callbacks=cb)
        return self

    def evaluate(self, test_X=None, test_y=None):
        if (test_X is None and test_y is None):
            self.train_X, self.test_X, self.train_y, self.test_y = \
                self.processor.plain_label()
            test_X, test_y = self.test_X, self.test_y
        accr = self.model.evaluate(self.tokenize_sentences(test_X), test_y)
        print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(
            accr[0], accr[1]))

    def predict(self, sentences):
        # pad the same way as in training so the input shape matches the model
        return self.model.predict(self.tokenize_sentences(sentences))

    def save(self,
             train_X=None,
             test_X=None,
             train_y=None,
             test_y=None,
             with_data=False):
        self.model.save(
            f"data/SimulacrumDiscriminator_{self.simulacrum_name}.model")
        settings = np.array(
            [self.num_epochs, self.batch_size, self.max_words, self.max_len])
        print("Saving Settings: ", settings)
        np.savetxt(f"data/SD_settings_{self.simulacrum_name}.csv",
                   settings,
                   delimiter=",")
        if with_data:
            if train_X is None and test_X is None and train_y is None and test_y is None:
                self.processor.cache_results(self.train_X, self.test_X,
                                             self.train_y, self.test_y)
            else:
                self.processor.cache_results(train_X, test_X, train_y, test_y)

    def load(self, simulacrum_name=None, with_data=False):
        if simulacrum_name is None:
            simulacrum_name = os.getenv("SIMULACRUM_NAME")
        self.simulacrum_name = simulacrum_name
        self.model = load_model(
            f"data/SimulacrumDiscriminator_{simulacrum_name}.model")
        # settings were saved as a numeric column; cast back to int when loading
        self.num_epochs, self.batch_size, self.max_words, self.max_len = (
            int(v) for v in np.loadtxt(
                f"data/SD_settings_{simulacrum_name}.csv", delimiter=","))
        if with_data:
            self.train_X, self.test_X, self.train_y, self.test_y = \
                self.processor.load_cache(simulacrum_name)
        return self
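
A hedged end-to-end sketch for SimulacrumDiscriminator; it assumes SIMULACRUM_NAME is set and that DataProcessor.plain_label() returns train/test splits of raw sentences with binary labels, which is implied but not shown here.

discriminator = SimulacrumDiscriminator(max_words=1000, max_len=50)
discriminator.train()        # fits the tokenizer and the LSTM classifier
discriminator.evaluate()     # prints test loss / accuracy
print(discriminator.predict(["an example sentence to score"]))
discriminator.save(with_data=True)   # persists model, settings and cached splits
# Note: load() restores the Keras model and settings, but the Tokenizer itself is not
# persisted, so predict() on a freshly loaded instance needs the tokenizer refit first.
reloaded = SimulacrumDiscriminator().load(with_data=True)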
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        choices=["NLI_M", "QA_M", "NLI_B", "QA_B"],
                        help="The name of the task to train.")
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--eval_test",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--accumulate_gradients",
        type=int,
        default=1,
        help=
        "Number of steps to accumulate gradient on (divide the batch_size and accumulate)"
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".
            format(args.accumulate_gradients))

    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # Sentiment labels: -2 = not mentioned, 0 = neutral, 1 = positive, -1 = negative
    sentiment_label = {"-2": "未提及", "0": "中性", "1": "正面", "-1": "负面"}

    # prepare dataloaders
    processor = DataProcessor(args.data_dir, args.task_name, sentiment_label)

    label_list = processor.get_labels()

    tokenizer = tokenization.ch_Tokenizer(vocab_file=args.vocab_file,
                                          do_lower_case=args.do_lower_case)

    # training set
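    # Each line of the train/test files is assumed (from the index usage in
    # load_func below, not from any documentation) to be tab-separated as:
    #   <id> \t <label> \t <text_a> \t <text_b>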
    def load_func(line, load_mode='train'):
        data = line.strip().split('\t')
        guid = "%s-%s" % (load_mode, data[0])
        text_a = tokenization.convert_to_unicode(data[2])
        text_b = tokenization.convert_to_unicode(data[3])
        label = tokenization.convert_to_unicode(data[1])
        example = InputExample(guid=guid,
                               text_a=text_a,
                               text_b=text_b,
                               label=label)
        feature = convert_example_to_feature(example, label_list,
                                             args.max_seq_length, tokenizer)
        return feature

    def load_func_train(line):
        result = load_func(line, load_mode='train')
        return result

    def load_func_test(line):
        result = load_func(line, load_mode='test')
        return result

    def batchify(batch):
        all_input_ids = torch.tensor([f.input_ids for f in batch],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in batch],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in batch],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in batch],
                                     dtype=torch.long)

        return [all_input_ids, all_input_mask, all_segment_ids, all_label_ids]

    train_path = processor.get_train_path()
    test_path = processor.get_test_path()

    train_data = tnt.dataset.ListDataset(train_path, load_func_train)

    num_train_steps = int(
        len(train_data) / args.train_batch_size * args.num_train_epochs)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=batchify)

    # model and optimizer
    model = BertForSequenceClassification(bert_config, len(label_list))
    if args.init_checkpoint is not None:
        model.bert.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.0
    }]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=", output_log_file)
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write(
                "epoch\tglobal_step\tloss\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch += 1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

        # eval_test
        if args.eval_test:
            test_dataset = tnt.dataset.ListDataset(test_path, load_func_test)
            test_dataloader = DataLoader(dataset=test_dataset,
                                         batch_size=args.eval_batch_size,
                                         collate_fn=batchify,
                                         shuffle=False)
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(
                    os.path.join(args.output_dir,
                                 "test_ep_" + str(epoch) + ".txt"),
                    "w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_test_loss, logits = model(input_ids, segment_ids,
                                                      input_mask, label_ids)

                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" " + str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy = np.sum(outputs == label_ids)

                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy

                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1

            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {
                'epoch': epoch,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps,
                'test_loss': test_loss,
                'test_accuracy': test_accuracy
            }
        else:
            result = {
                'epoch': epoch,
                'global_step': global_step,
                'loss': tr_loss / nb_tr_steps
            }

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")