def vectorize_data(filenames,
                   maxlen=100,
                   max_charlen=20,
                   output_label_size=6,
                   output_label_dict=None,
                   output_type="boundary",
                   return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            for seq in pp.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    # Shift all indices by 1 so that 0 stays reserved for padding.
                    x.append(1 + token.word_index)
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist())
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1)
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1)
                    y.append(y_idx)  # unknown labels end up as 0, the padding index
                X.append(x)
                if return_chars:
                    # Left-pad with all-zero char rows so X_char lines up with
                    # the padded word sequences in X.
                    padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                            pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
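A minimal usage sketch for this first variant; the file name and label dictionary below are illustrative placeholders, not values from the snippet's repo:

# Hypothetical boundary-label dictionary; indices are shifted by 1 inside
# vectorize_data, so 0 stays free for padding.
labels_dict = {"B": 0, "I": 1, "O": 2}
X_train, Y_train = vectorize_data(["train_docs.txt"],  # placeholder file name
                                  maxlen=100,
                                  output_label_size=6,
                                  output_label_dict=labels_dict,
                                  output_type="boundary")
# X_train: (num_sequences, 100) matrix of 1-shifted word indices
# Y_train: (num_sequences, 100, 6) one-hot label tensor from vtu.to_onehot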
Example #2
def vectorize_data(filenames,
                   maxlen=2000,
                   max_charlen=20,
                   output_label_size=6,
                   output_label_dict=None,
                   output_type="hybrid",
                   return_chars=False):
    """
    Using histogram of document lengths 2000 is a reasonable number train on.
    """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            # Each document yields a single flat token sequence here
            # (hence the document-length maxlen of 2000).
            seq = pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                # Shift all indices by 1 so that 0 stays reserved for padding.
                x.append(1 + token.word_index)
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist())
                if output_type == "hybrid":
                    # Hybrid labels join the boundary and category tags with "-".
                    y_idx = 1 + output_label_dict.get(
                        "%s-%s" % (token.b_label, token.c_label), -1)
                elif output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1)
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1)
                y.append(y_idx)  # unknown labels end up as 0, the padding index
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                        pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)

    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
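Example #2 looks labels up under the joined key "<b_label>-<c_label>", so its output_label_dict must be built with the same key format. A sketch of constructing such a dictionary; the tag names here are made up for illustration:

boundary_tags = ["B", "I", "O"]            # hypothetical boundary tag set
category_tags = ["PER", "LOC", "ORG"]      # hypothetical category tag set
hybrid_labels = ["%s-%s" % (b, c) for b in boundary_tags for c in category_tags]
output_label_dict = dict((label, i) for i, label in enumerate(hybrid_labels))
hybrid_size = len(output_label_dict) + 1   # +1 because index 0 is padding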
Example #3
def vectorize_data_old(filenames,
                       maxlen=100,
                       max_charlen=20,
                       output_label_size=6,
                       output_label_dict=None,
                       output_type="boundary",
                       return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp_old.get_documents(filename):
            for seq in pp_old.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    # Shift all indices by 1 so that 0 stays reserved for padding.
                    x.append(1 + token.word_index)
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist())
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1)
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1)
                    y.append(y_idx)  # unknown labels end up as 0, the padding index
                X.append(x)
                if return_chars:
                    padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                            pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)

    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
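All three variants delegate the final label encoding to vtu.to_onehot, whose implementation is not part of this listing. A minimal stand-in with the assumed (indices, depth) -> one-hot contract:

import numpy as np

def to_onehot(Y, label_size):
    # (n_seq, maxlen) int label matrix -> (n_seq, maxlen, label_size) one-hot;
    # assumes every entry of Y is a valid index in [0, label_size).
    return np.eye(label_size, dtype=np.int32)[Y]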
Example #4
# De-duplicated fragment of the training driver (the source snippet splices
# several overlapping copies of this region together). It assumes CONFIG,
# BASE_DATA_DIR, logger, the size/hyperparameter variables, and the
# gen_model_* builders are defined earlier in the elided script; the guard
# name "vectors_exist" below is a stand-in for the truncated condition.
if not vectors_exist:
    if model_type == "brnn_multitask":
        # Cache layout mirrors the loading slices below: train vectors at
        # indices 0-2, test vectors at 3-5. The vectorization calls and
        # train-side saves for this branch are truncated in the source.
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), X_test)
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][4]), vtu.onehot_to_idxarr(Y_test[0]))
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][5]), vtu.onehot_to_idxarr(Y_test[1]))
    elif model_type == "brnn_cnn_multitask":
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), X_test)
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][4]), X_char_test)
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][5]), vtu.onehot_to_idxarr(Y_test[0]))
        # Reshape arrays only after saving, so the cached files keep the
        # original (n_seq, maxlen, max_charlen) shape.
        X_char_train = X_char_train.reshape((X_char_train.shape[0], X_char_train.shape[1] * X_char_train.shape[2]))
        X_char_test = X_char_test.reshape((X_char_test.shape[0], X_char_test.shape[1] * X_char_test.shape[2]))
        logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape))
    else:
        X_train, Y_train = vectorize_data(train_files, maxlen=maxlen, output_label_size=labels_size, output_label_dict=labels_dict, output_type=label_type)
        X_test, Y_test = vectorize_data(test_files, maxlen=maxlen, output_label_size=labels_size, output_label_dict=labels_dict, output_type=label_type)
        logger.info("Saving preprocessed vectors for faster computation next time in %s files." % ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]])
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][0]), X_train)
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][1]), vtu.onehot_to_idxarr(Y_train))
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][2]), X_test)
        np.save("%s/%s" % (BASE_DATA_DIR, CONFIG["data_vectors"][3]), vtu.onehot_to_idxarr(Y_test))
else:
    logger.info("Preprocessed vectors exist. Loading from files %s." % ["%s/%s" % (BASE_DATA_DIR, k) for k in CONFIG["data_vectors"]])
    if model_type == "brnn_multitask":
        X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::3]]
        Y_train = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1])
                   for k in zip(CONFIG["data_vectors"][1:3], [boundary_size, category_size])]
        Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1])
                  for k in zip(CONFIG["data_vectors"][4:6], [boundary_size, category_size])]
    elif model_type == "brnn_cnn_multitask":
        # Single hybrid-label output. (An older revision of this snippet
        # cached eight files with stride 4 and separate boundary/category
        # label vectors instead.)
        X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::3]]
        X_char_train, X_char_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][1::3]]
        logger.info("Loaded X_char_train: %s, X_char_test: %s" % (X_char_train.shape, X_char_test.shape))
        # Flatten the (maxlen, max_charlen) char matrices to one row per sequence.
        X_char_train = X_char_train.reshape((X_char_train.shape[0], X_char_train.shape[1] * X_char_train.shape[2]))
        X_char_test = X_char_test.reshape((X_char_test.shape[0], X_char_test.shape[1] * X_char_test.shape[2]))
        Y_train = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1])
                   for k in zip(CONFIG["data_vectors"][2:3], [hybrid_size])]
        Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k[0])), k[1])
                  for k in zip(CONFIG["data_vectors"][5:6], [hybrid_size])]
    else:
        X_train, X_test = [np.load("%s/%s" % (BASE_DATA_DIR, k)) for k in CONFIG["data_vectors"][::2]]
        Y_train, Y_test = [vtu.to_onehot(np.load("%s/%s" % (BASE_DATA_DIR, k)), labels_size) for k in CONFIG["data_vectors"][1::2]]

if model_type == "brnn_multitask":
    logger.info("Loaded data shapes:\nX_train: %s, Y_train: %s\nX_test: %s, Y_test: %s"
                % (X_train.shape, [k.shape for k in Y_train],
                   X_test.shape, [k.shape for k in Y_test]))
elif model_type == "brnn_cnn_multitask":
    logger.info("Loaded data shapes:\nX_train: %s, X_char_train: %s, Y_train: %s\nX_test: %s, X_char_test: %s, Y_test: %s"
                % (X_train.shape, X_char_train.shape, [k.shape for k in Y_train],
                   X_test.shape, X_char_test.shape, [k.shape for k in Y_test]))

if model_type == "brnn_cnn_multitask":
    model, output_names, _temp_models = gen_model_brnn_cnn_multitask(
        vocab_size=vocab_size, char_vocab_size=char_vocab_size,
        embedding_size=embedding_size, char_embedding_size=char_embedding_size,
        nb_filters=nb_filters, maxlen=maxlen, max_charlen=max_charlen,
        output_size=[hybrid_size], hidden_layer_size=hidden_layer_size,
        num_hidden_layers=num_hidden_layers, RNN_LAYER_TYPE=RNN_LAYER_TYPE)
else:
    logger.error("Feature under development.")
if weights_file is not None:
    logger.info("Loading model weights from %s. Will continue training model from %s epochs." % (weights_file, base_epochs))
    model.load_weights(weights_file)
for epoch in xrange(base_epochs, n_epochs, save_every):
    logger.info("Starting Epochs %s to %s" % (epoch, epoch + save_every))
    start_time = time.time()
    if model_type == "brnn_cnn_multitask":
        # Keras Graph-style fit; the remaining fit arguments are truncated
        # in the source snippet.
        model.fit({"input1": X_train, "input2": X_char_train,
                   output_names[0]: Y_train[0]})
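The stride-3 slices above imply a six-file cache per run for the brnn_cnn_multitask case: word indices, character indices, and label indices for train, then the same three for test. The entries of CONFIG["data_vectors"] would look something like this (file names are illustrative):

CONFIG["data_vectors"] = [
    "X_train.npy", "X_char_train.npy", "Y_train_hybrid.npy",   # indices 0-2
    "X_test.npy",  "X_char_test.npy",  "Y_test_hybrid.npy",    # indices 3-5
]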