#    print_iter_count += 1

            except (KeyboardInterrupt, SystemExit):
                print ""
                print "########################################################"
                print "######  Pausing execution. Press ENTER to continue #####"
                print "########################################################"
                out = raw_input(
                    'Enter "pdb" to drop into the debugger or press ENTER to continue.> ')
                if out == "pdb":
                    pdb.set_trace()
            except Exception as e:
                print e
                print ">>>>> Is it intentional ?"

        progbar.end()
        if SAVE_MODEL_AFTER_EACH_EPOCH:
            model.save("model_trainable_%s_epoc_%d.h5" %
                       (str(TRAINABLE_EMBEDDINGS), epoch + 1))

        print ">> Epoch: %d/%d" % (epoch + 1, epochs)
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
        print('recall training = {}'.format(np.mean(mean_tr_rec)))
        print('loss training = {}'.format(np.mean(mean_tr_loss)))

        testing_on_data("Wikipedia(DEVELOPMENT)",
                        X_test,
                        Y_test,
                        model,
                        batch_size,
                        summary_only=True)
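
# testing_on_data() is defined elsewhere in this repository. The sketch below is
# only an assumed illustration of the evaluation pattern it is used for in this
# file (per-sentence batches, mean loss/accuracy/recall printed per dataset),
# not the actual implementation; the underscore-prefixed name is hypothetical.
def _sketch_testing_on_data(name, X, Y, model, batch_size, summary_only=False):
    losses, accs, recs = [], [], []
    for i in range(X.shape[0]):
        # batch_gen_sentences_without_context() is the repository's generator
        # used in the test loop at the bottom of this file.
        for batch_X, batch_Y in batch_gen_sentences_without_context(
                X[i], Y[i], batch_size, fixed_size=False):
            loss, acc, rec = model.test_on_batch([batch_X], batch_Y)
            losses.append(loss)
            accs.append(acc)
            recs.append(rec)
    # summary_only is accepted only to mirror the call sites above.
    print ">> (%s) loss=%.4f, accuracy=%.4f, recall=%.4f" % (
        name, np.mean(losses), np.mean(accs), np.mean(recs))
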
def get_input(sample_type,
              shuffle_documents,
              pad,
              trained_sent2vec_model=None):
    # Returns (sample_type, X, Y, sentence-embedding model)
    # X: each row is a sample
    # Y: a 1-D ground-truth vector per sample
    # Also pads each sample according to the configured INPUT_VECTOR_LENGTH when padding is requested

    start = time.time()
    data_handler = DataHandler()

    print "==========================================="
    if sample_type == 1:
        # NOT SURE ABOUT THIS TYPE!
        # Get samples; each sample is a set of INPUT_VECTOR_LENGTH consecutive sentences.
        # No document information is captured.
        sample_type, samples = data_handler.get_samples()
    elif sample_type == 2:
        ld = load_data.LoadData()
        sample_type, samples = ld.load_wikipedia_sequence()
    elif sample_type in (2, 3):
        # NOTE: sample_type == 2 is intercepted by the branch above, so in practice
        # only type 3 reaches this branch.
        # type2 : Each sample is a document (a set of sentences forming a sequence), i.e. (NUM_DOCUMENTS, NUM_SENTENCES, SENTENCE)
        # type3 : Same as type2, but the samples are merged to drop the sequence information and treat it
        #         as a plain sentence-classification problem, i.e. (TOTAL_NUM_SENTENCES, SENTENCE).
        #         That merging is done in cnn_clssifier.py itself.
        sample_type, samples = data_handler.get_sequence_samples(sample_type)
        # Alternative parallel loader:
        #sample_type, samples = data_handler.get_sequence_samples_PARALLEL()
    elif sample_type == 4:
        # type4: Clinical sequence of multiple samples
        # X.shape = (MULTIPLE_SAMPLES, TOTAL_SENTENCES)
        # Y.shape = (MULTIPLE_SAMPLES, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_clinical_sequence()
    elif sample_type == 5:
        # type5: Biography sequence of a single sample
        # X.shape = (1, TOTAL_SENTENCES)
        # Y.shape = (TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_biography_sequence()
    elif sample_type == 6:
        # type6: Fiction sequence of multiple documents
        # X.shape = (NO_OF_BOOKS, TOTAL_SENTENCES)
        # Y.shape = (NO_OF_BOOKS, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_fiction_sequence()
    elif sample_type == 7:
        # type7: Wiki sequence of multiple samples
        # Data format matches the clinical sequence: each line is a sentence
        # X.shape = (MULTIPLE_DOCUMENTS, TOTAL_SENTENCES)
        # Y.shape = (MULTIPLE_DOCUMENTS, TOTAL_SENTENCES, 1)
        ld = load_data.LoadData()
        sample_type, samples = ld.load_wikipedia_sequence()
    else:
        print "NOTE: INVALID SAMPLE_TYPE!"
        return None

    del data_handler
    print "Samples Loading took", time.time() - start, "seconds"

    model = trained_sent2vec_model
    if not trained_sent2vec_model:
        #model = TFIDF(samples)
        #model = MeanWord2vec()
        #model = TFIDFweightedMeanWord2vec(samples)
        model = CustomSent2vec()

    X, Y = [], []
    _total_samples, _start_time = len(samples), time.time()
    print len(samples)
    #pdb.set_trace()
    for _idx, sample in enumerate(samples):
        # Each sample is a document
        # Each sample is a list of tuples with each tuple as (sentence, groundTruth)
        sentences, groundTruths = zip(*sample)  # Unpack a sample

        ## Create Wikipedia test set
        CREATE_WIKI_TEST_SET = False
        if CREATE_WIKI_TEST_SET:
            wiki_prefix = "wiki_save/wiki_test"
            if _idx >= 300:
                break
            with open(wiki_prefix + "_" + str(_idx + 1) + ".ref", "a") as f:
                for (_s, _g) in sample:
                    if _g:
                        f.write("==========\r\n")
                    f.write(_s + "\r\n")
                f.write("==========\r\n")
        else:
            # Traditional code
            if not _idx % 50:
                progbar.simple_update("Converting doc to martices",
                                      _idx + 1,
                                      _total_samples,
                                      time_elapsed=(time.time() - _start_time))

            if sample_type == 1:
                # Correct groundtruth sync problem here
                sentences, groundTruths = model.convert_sample_to_vec(
                    sentences, groundTruths)
            elif sample_type in (2, 3, 4, 5, 6, 7):
                sentences, groundTruths = model.convert_sequence_sample_to_vec(
                    sentences, groundTruths)
            else:
                print "Wrong Sample TYPE"

            if sentences is None:
                continue
            X.append(sentences)  # X[0].shape = matrix([[1,2,3,4.....]])
            Y.append(np.asarray(
                groundTruths))  # Y[0] = [1, 0, 0, ..... 0, 1, 0, 1....]
    progbar.simple_update("Creating a standalone matrix for samples...", -1,
                          -1)
    X, Y = np.asarray(X), np.asarray(Y)
    progbar.end()

    print "Total samples: %d" % (len(X))
    if shuffle_documents:  # Shuffle the X's and Y's if required
        # Both of them have to be in unison
        X, Y = unison_shuffled_copies(X, Y)
        print "SHUFFLE: Shuffled input document order! (X:", X.shape, ", Y:", Y.shape, ")"

    if sample_type == 2 and not pad:
        print "NOTE: Sample type2 requires PADDING!"

    if pad:
        # NOTE: This padding logic is messy; double-check it before relying on it.
        if STATIC_PAD:
            max_len = AVERAGE_WORDS
        else:
            max_len = None  # Uses the max length of the sequences

        doc_lengths = [len(doc) for doc in X]
        print "Padding sequences. Doc-lengths: Mean=%d, Std=%d" % (
            np.mean(doc_lengths), np.std(doc_lengths))
        X = pad_sequences(X,
                          maxlen=max_len,
                          padding="post",
                          truncating="post",
                          value=0.0,
                          dtype=np.float32)
        Y = pad_sequences(Y,
                          maxlen=max_len,
                          padding="post",
                          truncating="post",
                          value=0.0,
                          dtype=np.float32)

        print "Size of new X (after padding):", X.shape

    return sample_type, X, Y, model
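
# unison_shuffled_copies() is called inside get_input() above but defined
# elsewhere in the repository. A minimal sketch of what such a helper usually
# looks like (an assumption, not the repository's code) is:
def _sketch_unison_shuffled_copies(a, b):
    # Shuffle two equal-length arrays with one shared permutation so that
    # X[i] and Y[i] stay paired after the shuffle.
    assert len(a) == len(b)
    perm = np.random.permutation(len(a))
    return a[perm], b[perm]

# Hypothetical call of get_input(); sample_type=7 loads the multi-document
# Wikipedia sequence, and the flags mirror the ones handled above:
#   sample_type, X, Y, sent2vec_model = get_input(
#       sample_type=7, shuffle_documents=True, pad=True)
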
def custom_fit(X_train, Y_train, X_test, Y_test, model, batch_size, epochs=10):

    # Print Train stats
    total_sentences, total_documents = 0, 0
    total_documents = X_train.shape[0]
    total_sentences = sum([doc.shape[0] for doc in X_train])
    print "X-wiki TRAIN stats: Total %d sentences in %d documents" % (
        total_sentences, total_documents)

    class_weight = None
    if SCALE_LOSS_FUN:
        # Iterate document by document: the number of sentences differs per
        # document, so a single np.unique() over the whole Y_train is not possible.
        classes, counts = None, []
        for _temp_Yi in Y_train:
            classes, _temp_counts = np.unique(_temp_Yi, return_counts=True)
            counts.append(_temp_counts)
        counts = np.sum(counts, axis=0)
        class_weight = dict(zip(classes.tolist(), counts / float(sum(counts))))
        print class_weight  # NOTE: currently only printed for reference; not passed to the training call

    train_avg_seg_len = np.mean(
        [helper.compute_avg_seg_len(Yi) for Yi in Y_train], axis=0)
    print ">> Train AVG_SEGMENT_LENGTH:", train_avg_seg_len

    print 'Train...'
    start_epoch = 0
    if LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN:  # If a saved model exists, continue from the last epoch where training stopped
        start_epoch = saved_model_epoch_done  # The epoch count is zero-indexed during training, while the count in the saved file is one-indexed

    for epoch in range(start_epoch, epochs):
        mean_tr_acc, mean_tr_loss, mean_tr_rec = [], [], []
        rLoss, rRecall, rAcc = 0, 0, 0  # Running parameters for printing while training
        for batch_count, (
                batch_X_left, batch_X_mid, batch_X_right,
                batch_Y_mid) in enumerate(
                    batch_gen_consecutive_context_segments_from_big_seq(
                        X_train, Y_train, batch_size, ONE_SIDE_CONTEXT_SIZE)):
            #batch_Y_vec = to_categorical_MULTI_DIM(batch_Y, nb_classes=2)
            try:
                start = time.time()
                tr_loss, tr_acc, tr_rec = model.train_on_batch(
                    [batch_X_left, batch_X_mid, batch_X_right], batch_Y_mid)
                speed = time.time() - start

                mean_tr_acc.append(tr_acc)
                mean_tr_loss.append(tr_loss)
                mean_tr_rec.append(tr_rec)
                #rLoss, rRecall, rAcc = (rLoss*batch_count + tr_loss)/(batch_count + 1), (rRecall*batch_count + tr_rec)/(batch_count + 1), (rAcc*batch_count + tr_acc)/(batch_count + 1)
                #progbar.prog_bar(True, total_sentences, epochs, batch_size, epoch, batch_count, speed=speed, data={ 'rLoss': rLoss, 'rAcc': rAcc, 'rRec': rRecall })
                progbar.prog_bar(True,
                                 total_sentences,
                                 epochs,
                                 batch_size,
                                 epoch,
                                 batch_count,
                                 speed=speed,
                                 data={
                                     'Loss': tr_loss,
                                     'Acc': tr_acc,
                                     'Rec': tr_rec
                                 })

                # Print test results after every 100 batch trains
                if (not batch_count % 100) and batch_count != 0:
                    testing_on_data("Wikipedia",
                                    X_test,
                                    Y_test,
                                    model,
                                    batch_size,
                                    summary_only=True)
                    testing_on_data("Clinical", X_cli, Y_cli, model,
                                    batch_size)
                    testing_on_data("Biography", X_bio, Y_bio, model,
                                    batch_size)
                    testing_on_data("Fiction",
                                    X_fic,
                                    Y_fic,
                                    model,
                                    batch_size,
                                    summary_only=True)

            except (KeyboardInterrupt, SystemExit):
                print ""
                print "########################################################"
                print "######  Pausing execution. Press ENTER to continue #####"
                print "########################################################"
                out = raw_input(
                    'Enter "pdb" to drop into the debugger or press ENTER to continue.> ')
                if out == "pdb":
                    pdb.set_trace()

        progbar.end()
        if SAVE_MODEL_AFTER_EACH_EPOCH:
            model.save("model_trainable_%s_epoc_%d.h5" %
                       (str(TRAINABLE_EMBEDDINGS), epoch + 1))

        print ">> Epoch: %d/%d" % (epoch + 1, epochs)
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
        print('recall training = {}'.format(np.mean(mean_tr_rec)))
        print('loss training = {}'.format(np.mean(mean_tr_loss)))

        testing_on_data("Wikipedia",
                        X_test,
                        Y_test,
                        model,
                        batch_size,
                        summary_only=True)
        testing_on_data("Clinical", X_cli, Y_cli, model, batch_size)
        testing_on_data("Biography", X_bio, Y_bio, model, batch_size)
        testing_on_data("Fiction",
                        X_fic,
                        Y_fic,
                        model,
                        batch_size,
                        summary_only=True)

        print('___________________________________')
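
# batch_gen_consecutive_context_segments_from_big_seq() used in the training
# loop above is defined elsewhere in the repository. The sketch below only
# illustrates the assumed idea: for every sentence in every document, yield the
# sentence vector together with ONE_SIDE_CONTEXT_SIZE left/right neighbours
# (zero-padded at the document boundaries) and its label, grouped into batches.
# The underscore-prefixed name and the exact padding scheme are assumptions.
def _sketch_batch_gen_context(X_docs, Y_docs, batch_size, one_side_context_size):
    lefts, mids, rights, labels = [], [], [], []
    for doc, truth in zip(X_docs, Y_docs):
        n, dim = doc.shape[0], doc.shape[1]
        pad = np.zeros((one_side_context_size, dim), dtype=doc.dtype)
        padded = np.concatenate([pad, doc, pad], axis=0)
        for i in range(n):
            lefts.append(padded[i:i + one_side_context_size])          # left context
            mids.append(doc[i])                                        # centre sentence
            rights.append(padded[i + one_side_context_size + 1:
                                 i + 2 * one_side_context_size + 1])   # right context
            labels.append(truth[i])
            if len(mids) == batch_size:
                yield (np.asarray(lefts), np.asarray(mids),
                       np.asarray(rights), np.asarray(labels))
                lefts, mids, rights, labels = [], [], [], []
    if mids:  # flush the last partial batch
        yield (np.asarray(lefts), np.asarray(mids),
               np.asarray(rights), np.asarray(labels))
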
def custom_fit(X_train, Y_train, X_test, Y_test, model, batch_size, epochs=10):

    # Print Train stats
    total_sentences, total_documents = 0, 0
    total_documents = X_train.shape[0]
    total_sentences = sum([doc.shape[0] for doc in X_train])
    print "X-wiki TRAIN stats: Total %d sentences in %d documents" % (
        total_sentences, total_documents)

    class_weight = None
    if SCALE_LOSS_FUN:
        # Iterate document by document: the number of sentences differs per
        # document, so a single np.unique() over the whole Y_train is not possible.
        classes, counts = None, []
        for _temp_Yi in Y_train:
            classes, _temp_counts = np.unique(_temp_Yi, return_counts=True)
            counts.append(_temp_counts)
        counts = np.sum(counts, axis=0)
        class_weight = dict(zip(classes.tolist(), counts / float(sum(counts))))
        print class_weight  # NOTE: currently only printed for reference; not passed to the training call

    train_avg_seg_len = np.mean(
        [helper.compute_avg_seg_len(Yi) for Yi in Y_train], axis=0)
    print ">> Train AVG_SEGMENT_LENGTH:", train_avg_seg_len

    print 'Train...'
    start_epoch = 0
    if LOAD_SAVED_MODEL_AND_CONTINUE_TRAIN:  # If a saved model exists, continue from the last epoch where training stopped
        start_epoch = saved_model_epoch_done  # The epoch count is zero-indexed during training, while the count in the saved file is one-indexed

    print_iter_count = 0
    for epoch in range(start_epoch, epochs):
        mean_tr_acc, mean_tr_loss, mean_tr_rec = [], [], []
        batch_count = 0
        rLoss, rRecall, rAcc = 0, 0, 0  # Running parameters for printing while training
        for i in range(total_documents):
            X, Y = X_train[i], Y_train[i]
            for (batch_X, batch_Y) in batch_gen_sentences_without_context(
                    X, Y, batch_size, fixed_size=False):
                #pdb.set_trace()

                batch_Y = to_categorical(
                    batch_Y, nb_classes=2)  # Convert labels to a 2-class one-hot encoding

                start = time.time()
                tr_loss, tr_acc, tr_rec = model.train_on_batch([batch_X],
                                                               batch_Y)
                speed = time.time() - start

                mean_tr_acc.append(tr_acc)
                mean_tr_loss.append(tr_loss)
                mean_tr_rec.append(tr_rec)
                #rLoss, rRecall, rAcc = (rLoss*batch_count + tr_loss)/(batch_count + 1), (rRecall*batch_count + tr_rec)/(batch_count + 1), (rAcc*batch_count + tr_acc)/(batch_count + 1)
                #progbar.prog_bar(True, total_sentences, epochs, batch_size, epoch, batch_count, speed=speed, data={ 'rLoss': rLoss, 'rAcc': rAcc, 'rRec': rRecall })
                progbar.prog_bar(True,
                                 total_sentences,
                                 epochs,
                                 batch_size,
                                 epoch,
                                 batch_count,
                                 speed=speed,
                                 data={
                                     'Loss': tr_loss,
                                     'Acc': tr_acc,
                                     'Rec': tr_rec
                                 })
                batch_count += 1

        progbar.end()
        if SAVE_MODEL_AFTER_EACH_EPOCH:
            model.save("model_trainable_%s_epoc_%d.h5" %
                       (str(TRAINABLE_EMBEDDINGS), epoch + 1))

        print ">> Epoch: %d/%d" % (epoch + 1, epochs)
        print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
        print('recall training = {}'.format(np.mean(mean_tr_rec)))
        print('loss training = {}'.format(np.mean(mean_tr_loss)))

        testing_on_data("Wikipedia(DEVELOPMENT)",
                        X_test,
                        Y_test,
                        model,
                        batch_size,
                        summary_only=True)
        testing_on_data("Clinical",
                        X_cli,
                        Y_cli,
                        model,
                        batch_size,
                        summary_only=True)
        #testing_on_data("Biography", X_bio, Y_bio, model, batch_size)
        testing_on_data("Fiction",
                        X_fic,
                        Y_fic,
                        model,
                        batch_size,
                        summary_only=True)
        testing_on_data("Wikipedia(BENCHMARK)",
                        X_wikitest,
                        Y_wikitest,
                        model,
                        batch_size,
                        summary_only=True)

        print('___________________________________')

    # Testing
    print "####################################################################"
    print ">> (TEST) >> Testing, X:", X_test.shape, "Y:", Y_test.shape
    mean_te_acc, mean_te_loss, mean_te_rec = [], [], []
    for i in range(X_test.shape[0]):
        X, Y = X_test[i], Y_test[i]
        for batch_X, batch_Y in batch_gen_sentences_without_context(
                X, Y, batch_size, fixed_size=False):
            te_loss, te_acc, te_rec = model.test_on_batch([batch_X], batch_Y)
            mean_te_acc.append(te_acc)
            mean_te_loss.append(te_loss)
            mean_te_rec.append(te_rec)

    print('accuracy testing = {}'.format(np.mean(mean_te_acc)))
    print('recall testing = {}'.format(np.mean(mean_te_rec)))
    print('loss testing = {}'.format(np.mean(mean_te_loss)))
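
# batch_gen_sentences_without_context() used above is defined elsewhere in the
# repository. A minimal sketch of the assumed behaviour (slice one document's
# sentence vectors and labels into batches; drop the trailing partial batch
# only when fixed-size batches are requested) is:
def _sketch_batch_gen_sentences(X_doc, Y_doc, batch_size, fixed_size=False):
    n = X_doc.shape[0]
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        if fixed_size and end - start < batch_size:
            break  # skip the final partial batch when a fixed batch size is required
        yield np.asarray(X_doc[start:end]), np.asarray(Y_doc[start:end])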