def cnn_lstm_DA(args):
    """Train (or only evaluate) a CNN-LSTM dialog QA model.

    Loads and prunes train/dev JSON data, builds word and entity
    dictionaries from the training set, constructs the model selected by
    ``args.model_to_run`` ('cnn_lstm_DA' or 'cnn_lstm'), runs an initial
    dev evaluation, then trains for ``args.nb_epoch`` epochs, evaluating
    every 100 mini-batch updates and saving the model whenever dev
    accuracy improves.

    Args:
        args: parsed command-line namespace; the attributes read here are
            train_file, dev_file, stopwords, embedding_size, embedding_file,
            model_to_run, utterance_filters, query_filters, learning_rate,
            dropout, hidden_size, pre_trained, batch_size, test_only,
            save_model and nb_epoch.

    Raises:
        ValueError: if ``args.model_to_run`` names an unknown model.
    """
    logger_exp.info('-' * 50)
    logger_exp.info('Load data files..')
    # get prune dictionaries
    redundent_1, redundent_2 = tools.prune_data(args.train_file)
    # load training data; max_d/max_q/max_s are the maximum dialog, query
    # and scene lengths and are reused below when vectorizing dev data
    train_examples, max_d, max_q, max_s = tools.load_jsondata(
        args.train_file, redundent_1, redundent_2, args.stopwords)
    # load development data (its own max lengths are intentionally ignored;
    # dev is padded to the training maxima)
    dev_examples, _, _, _ = tools.load_jsondata(args.dev_file, redundent_1,
                                                redundent_2, args.stopwords)

    num_train = len(train_examples[0])
    num_dev = len(dev_examples[0])
    logger_exp.info('-' * 50)
    logger_exp.info('Build dictionary..')
    word_dict = tools.build_dict(train_examples[0], train_examples[1])
    # entity dictionary for entire dataset: every '@ent*' marker seen in
    # the vocabulary plus every answer entity, with a dedicated unknown slot
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@ent')] +
            train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logger_exp.info('Entity markers: %d' % len(entity_dict))
    num_labels = len(entity_dict)

    logger_exp.info('-' * 50)
    # Load embedding file; embedding_size is overwritten with the actual
    # dimensionality of the loaded embeddings
    embeddings = tools.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)

    (vocab_size, args.embedding_size) = embeddings.shape
    logger_exp.info('Building Model..')
    # build the requested model; fail fast on an unknown name instead of
    # hitting a NameError on cnn_model below
    if args.model_to_run == 'cnn_lstm_DA':
        cnn_model = CNN_LSTM_DA_Model(
            'CNN_LSTM_DA_Model',
            num_labels,
            vocab_size,
            args.embedding_size,
            max_d,
            max_q,
            max_s,
            nb_filters_utterance=args.utterance_filters,
            nb_filters_query=args.query_filters,
            learning_rate=args.learning_rate,
            dropout=args.dropout,
            nb_hidden_unit=args.hidden_size)
    elif args.model_to_run == 'cnn_lstm':
        cnn_model = CNN_LSTM_Model('CNN_LSTM_Model',
                                   num_labels,
                                   vocab_size,
                                   args.embedding_size,
                                   max_d,
                                   max_q,
                                   max_s,
                                   nb_filters_utterance=args.utterance_filters,
                                   nb_filters_query=args.query_filters,
                                   learning_rate=args.learning_rate,
                                   dropout=args.dropout,
                                   nb_hidden_unit=args.hidden_size)
    else:
        raise ValueError('Unknown model_to_run: %s' % args.model_to_run)

    cnn_model.load_embedding(np.array([embeddings]))
    if args.pre_trained is not None:
        cnn_model.load_weights(args.pre_trained)

    logger_exp.info('Done.')

    logger_exp.info('-' * 50)
    logger_exp.info(args)

    logger_exp.info('-' * 50)
    logger_exp.info('Initial test..')
    # vectorize development data using the training-set maxima
    dev_x1, dev_x2, dev_l, dev_y, dev_qmask, dev_dmask = tools.vectorize(
        dev_examples, word_dict, entity_dict, max_d, max_q, max_s)
    assert len(dev_x1) == num_dev

    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, dev_qmask, dev_dmask,
                           args.batch_size)
    dev_acc = eval_acc(cnn_model, all_dev, max_d, max_q, max_s)
    logger_exp.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    if args.test_only:
        return
    # checkpoint the untrained (or pre-trained) model so a file always exists
    cnn_model.save_model(args.save_model)

    # Training
    logger_exp.info('-' * 50)
    logger_exp.info('Start training..')

    # vectorize training data
    train_x1, train_x2, train_l, train_y, train_qmask, train_dmask = tools.vectorize(
        train_examples, word_dict, entity_dict, max_d, max_q, max_s)
    assert len(train_x1) == num_train

    train_x1, train_x2, train_l, train_y, train_qmask, train_dmask = pre_shuffle(
        train_x1, train_x2, train_l, train_y, train_qmask, train_dmask)
    start_time = time.time()
    n_updates = 0
    all_train = gen_examples(train_x1, train_x2, train_l, train_y, train_qmask,
                             train_dmask, args.batch_size)

    for epoch in range(args.nb_epoch):
        # reshuffle the batch order each epoch
        np.random.shuffle(all_train)

        for idx, (mb_x1, mb_x2, mb_l, mb_y, mb_qmask,
                  mb_dmask) in enumerate(all_train):
            logger_exp.info('#Examples = %d' % (len(mb_x1)))
            # rearrange each batch of dialogs: transpose from
            # per-dialog lists of scenes to per-scene arrays over the batch
            # (range, not the Python-2-only xrange)
            newx1 = []
            for i in range(len(mb_x1[0])):
                newx1.append(np.array([scene[i] for scene in mb_x1]))

            hist = cnn_model.fit(newx1 + [np.array(mb_x2)] + [np.array(mb_l)] +
                                 [np.array(mb_qmask)] + [np.array(mb_dmask)],
                                 np.array(mb_y),
                                 batch_size=args.batch_size,
                                 verbose=0)
            logger_exp.info(
                'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                % (epoch, idx, len(all_train), hist.history['loss'][0],
                   time.time() - start_time))
            n_updates += 1
            # evaluate every 100 batches on a random train sample of the
            # same size as the dev set, plus the full dev set
            if n_updates % 100 == 0:
                samples = sorted(
                    np.random.choice(num_train,
                                     min(num_train, num_dev),
                                     replace=False))
                sample_train = gen_examples(
                    [train_x1[k] for k in samples],
                    [train_x2[k] for k in samples], train_l[samples],
                    [train_y[k] for k in samples], train_qmask[samples],
                    train_dmask[samples], args.batch_size)

                logger_exp.info(
                    'Train accuracy: %.2f %%' %
                    eval_acc(cnn_model, sample_train, max_d, max_q, max_s))
                dev_acc = eval_acc(cnn_model, all_dev, max_d, max_q, max_s)
                logger_exp.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logger_exp.info(
                        'Best dev accuracy: epoch = %d, n_updates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    cnn_model.save_model(args.save_model)
def train_sentiment():
    """Train the binary sentiment classifier and return validation accuracy.

    Loads manually labeled training data and a separate validation file,
    builds and saves the word dictionary, trains with inverse-frequency
    class weights, evaluates on the validation set, and writes each
    misclassified validation sample to
    ./data/wrong_analysis/sentiment_wrong_result.txt.

    Returns:
        The validation accuracy reported by ``model.evaluate``.
    """
    import os  # local import so the function stays self-contained

    # load data
    samples, labels, ID2label = load_training_data_sentiment(
        './data/manually_labeled_data_sentiment.txt')
    samples_val, labels_val = load_val_data_sentiment(
        './data/sentiment_test_data.txt')

    # build dict (named word_dict to avoid shadowing the builtin `dict`)
    word_dict = tools.build_dict(samples, tools.MAX_NB_WORDS)
    tools.save_dict(word_dict)  # save the dict to local

    # inverse-frequency class weights to mitigate label imbalance;
    # float() prevents truncating integer division under Python 2
    sentiment_weight = {}
    for i in range(2):
        sentiment_weight[i] = len(labels) / float(labels.count(i))

    print(len(word_dict))
    embedding_matrix, nb_words, EMBEDDING_DIM = tools.load_embedding(
        word_dict)  # load embedding
    N_label = len(ID2label)
    # normalize/pad the input data to length 100
    X, y = tools.normalize_training_data(samples, labels, N_label, word_dict,
                                         100)
    X_val, y_val = tools.normalize_training_data(samples_val, labels_val,
                                                 N_label, word_dict, 100)

    print(len(X))
    print(len(y))

    # shuffle samples/labels in lockstep with X/y
    NUM = len(X)
    indices = np.arange(NUM)
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    samples = np.asarray(samples)[indices]
    labels = np.asarray(labels)[indices]
    # all data is used for training; validation comes from the separate file
    training_ratio = 1
    N_train = int(NUM * training_ratio)
    X_train = X[:N_train]
    y_train = y[:N_train]
    # uniform per-sample weights (class imbalance is handled by
    # sentiment_weight instead)
    sample_weights = np.ones(len(y_train))

    model = tools.define_model(tools.MAX_SEQUENCE_LENGTH, embedding_matrix,
                               nb_words, EMBEDDING_DIM, N_label)
    # os.path.join instead of a hard-coded backslash path so the best-model
    # checkpoint lands in code/model_sentiment on every platform
    model_save_path = os.path.join('code', 'model_sentiment')
    model = tools.train_model(model, X_train, y_train, X_val, y_val,
                              sample_weights, model_save_path,
                              sentiment_weight)

    # get the score and acc for the model
    score, acc = model.evaluate(X_val, y_val, batch_size=2000)
    print('Test score:', score)
    print('Test accuracy:', acc)

    # get the concrete predicted value for each text
    pred = model.predict(X_val, batch_size=2000)
    labels_pred = tools.probs2label(
        pred)  # change the predicted value to labels

    # save the wrong results; `with` guarantees the handle is closed (and
    # flushed) even if a write raises
    with codecs.open('./data/wrong_analysis/sentiment_wrong_result.txt',
                     "w",
                     encoding='utf-8',
                     errors='ignore') as writer_sentiment:
        for i in range(len(labels_val)):
            if labels_val[i] != labels_pred[i]:
                writer_sentiment.write(samples_val[i] + '\t' +
                                       ID2label[labels_val[i]] + '\t' +
                                       ID2label[labels_pred[i]] + '\n')
    return acc
def train_class():
    """Train the fine-grained (class2) classifier and score both levels.

    Trains on a shuffled 90/10 train/validation split, writes the
    misclassified validation samples for class2 and for the coarser class1
    (obtained by mapping class2 labels through class2_class1.txt), and
    returns both accuracies.

    Returns:
        Tuple ``(accuracy_class2, accuracy_class1)``.
    """
    import os  # local import so the function stays self-contained

    # load class data
    samples, labels, ID2label = load_training_data_class2(
        tools.PATH + '/data/class2_labels.txt', tools.PATH +
        '/data/manually_labeled_data_class2.txt')
    # build dict (named word_dict to avoid shadowing the builtin `dict`)
    word_dict = tools.build_dict(samples, tools.MAX_NB_WORDS)
    print(len(word_dict))
    tools.save_dict(word_dict)  # save the dict to local
    embedding_matrix, nb_words, EMBEDDING_DIM = tools.load_embedding(
        word_dict)  # load embedding
    N_label = len(ID2label)
    # normalize/pad the input data to length 100
    X, y = tools.normalize_training_data(samples, labels, N_label, word_dict,
                                         100)
    print(len(X))
    print(len(y))

    # shuffle everything in lockstep, then split 90% train / 10% validation
    NUM = len(X)
    indices = np.arange(NUM)
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    samples = np.asarray(samples)[indices]
    labels = np.asarray(labels)[indices]
    training_ratio = 0.9
    N_train = int(NUM * training_ratio)
    X_train = X[:N_train]
    y_train = y[:N_train]
    X_val = X[N_train:]
    y_val = y[N_train:]
    samples_val = samples[N_train:]
    labels_val = labels[N_train:]
    # uniform per-sample weights
    sample_weights = np.ones(len(y_train))

    model = tools.define_model(tools.MAX_SEQUENCE_LENGTH, embedding_matrix,
                               nb_words, EMBEDDING_DIM, N_label)
    # os.path.join instead of a hard-coded backslash path so the best-model
    # checkpoint lands in code/model_class2 on every platform
    model_save_path = os.path.join('code', 'model_class2')
    model = tools.train_model(model, X_train, y_train, X_val, y_val,
                              sample_weights, model_save_path)

    # get the score and acc for the model
    score, accuracy_class2 = model.evaluate(X_val, y_val, batch_size=2000)
    print('Test score:', score)
    print('Test accuracy:', accuracy_class2)

    # get the concrete predicted value for each text
    pred = model.predict(X_val, batch_size=2000)
    labels_pred = tools.probs2label(
        pred)  # change the predicted value to labels

    # save the wrong results for class2; `with` closes the handle even if a
    # write raises
    with codecs.open(tools.PATH +
                     '/data/wrong_analysis/class2_wrong_result.txt',
                     "w",
                     encoding='utf-8',
                     errors='ignore') as writer_class2:
        for i in range(len(labels_val)):
            if labels_val[i] != labels_pred[i]:
                writer_class2.write(samples_val[i] + '\t' +
                                    ID2label[labels_val[i]] + '\t' +
                                    ID2label[labels_pred[i]] + '\n')

    # merge the class2 predictions up to class1 and score them
    class2_class1 = load_class2_to_class1(
        tools.PATH + '/data/class2_class1.txt')
    N_class1_true = 0
    wrong_class = []
    for i in range(len(labels_val)):
        if class2_class1[ID2label[labels_val[i]]] == class2_class1[ID2label[
                labels_pred[i]]]:
            N_class1_true += 1
        else:
            wrong_class.append(class2_class1[ID2label[labels_val[i]]] + "\t" +
                               class2_class1[ID2label[labels_pred[i]]] + "\t" +
                               samples_val[i])

    # save the wrong results for class1
    with codecs.open(tools.PATH +
                     '/data/wrong_analysis/class1_wrong_result.txt',
                     "w",
                     encoding='utf-8',
                     errors='ignore') as writer:
        writer.write("original_label" + "\t" + "predict_label" + "\t" +
                     "sample" + "\n")
        for item in wrong_class:
            writer.write(item + '\n')

    # float() prevents the ratio collapsing to 0 under Python 2 integer
    # division
    accuracy_class1 = N_class1_true / float(len(labels_val))
    print(accuracy_class1)
    return accuracy_class2, accuracy_class1