Example 1
def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=1, label_column=4,
                                   label_name='ner', oov='embedding', fine_tune=False, embedding='word2vec',
                                   embedding_path=None,
                                   use_character=False):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of words (starting from 0)
    :param label_column: the column index of labels (starting from 0)
    :param label_name: name of the label, such as pos or ner
    :param oov: embedding for OOV words, chosen from ['random', 'embedding']. If "embedding", words in the dev and
                test data are added to the alphabet; if "random", they are not.
    :param fine_tune: whether to fine-tune word embeddings.
    :param embedding: word embeddings to use, chosen from ['word2vec', 'senna'].
    :param embedding_path: path of the file storing word embeddings.
    :param use_character: whether to use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
            embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor_fine_tune(word_index_sentences, label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                label = label_ids[j]
                X[i, j] = wid
                Y[i, j] = label - 1

            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, embedd_table, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Loaded embedding dict with %d entries" % len(embedd_dict))
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train,
                                                                  label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev, label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test, label_index_sentences_test)
        # generate_character_data returns 7 values, so the fallback must provide matching placeholders
        # when character embeddings are disabled (the original 4-element tuple would fail to unpack).
        if use_character:
            C_train, C_dev, C_test, char_embedd_table, mask_c_train, mask_c_dev, mask_c_test = \
                generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length)
            last_index_c_train, last_index_c_dev, last_index_c_test = \
                generate_last_index(mask_c_train, mask_c_dev, mask_c_test)
        else:
            C_train, C_dev, C_test, char_embedd_table = None, None, None, None
        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless), label_alphabet, \
               C_train, C_dev, C_test, char_embedd_table

    def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict,
                                       embedd_dim, caseless):
        X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX)

        # bad_dict = dict()
        # bad_num = 0
        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                label = label_ids[j]
                embedd = embedd_dict[word] if word in embedd_dict else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label - 1

                # if word not in embedd_dict:
                #     bad_num += 1
                #     if word in bad_dict:
                #         bad_dict[word] += 1
                #     else:
                #         bad_dict[word] = 1

            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1

        # for w, c in bad_dict.items():
        #     if c >= 100:
        #         print "%s: %d" % (w, c)
        # print bad_num

        return X, Y, mask

    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """

        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                           logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                      label_index_sentences_train, unknown_embedd,
                                                                      embedd_dict, embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev,
                                                                unknown_embedd, embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test,
                                                                   unknown_embedd, embedd_dict, embedd_dim, caseless)
        # Same 7-value unpacking as in the fine-tuning path; provide matching placeholders
        # when character embeddings are disabled.
        if use_character:
            C_train, C_dev, C_test, char_embedd_table, mask_c_train, mask_c_dev, mask_c_test = \
                generate_character_data(word_sentences_train, word_sentences_dev, word_sentences_test, max_length)
        else:
            C_train, C_dev, C_test, char_embedd_table = None, None, None, None

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
               None, label_alphabet, C_train, C_dev, C_test, char_embedd_table

    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling(
        train_path, word_alphabet, label_alphabet, word_column, label_column)
    #print word_sentences_train[0]
    #print word_index_sentences_train[0]
    #print label_index_sentences_train

    # if oov is "random" and we are not fine-tuning, close the word alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling(
        dev_path, word_alphabet, label_alphabet, word_column, label_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()
    label_alphabet.save('tmp', 'label_alphabet')
    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    if fine_tune:
        logger.info("Generating data with fine tuning...")
        return generate_dataset_fine_tune()
    else:
        logger.info("Generating data without fine tuning...")
        return generate_dataset_not_fine_tune()
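
A minimal usage sketch follows. The paths, column indices, and embedding file are hypothetical, and the call relies on the module-level helpers used above (read_conll_sequence_labeling, get_max_length, build_embedd_table, generate_character_data, generate_last_index, Alphabet, logger, MAX_LENGTH) from the original source.

# Hypothetical paths and column layout; adjust word_column/label_column to the actual CoNLL file format.
datasets = load_dataset_sequence_labeling(
    'data/eng.train', 'data/eng.dev', 'data/eng.test',
    word_column=0, label_column=3, label_name='ner',
    oov='embedding', fine_tune=True,
    embedding='word2vec', embedding_path='data/vectors.bin',
    use_character=True)

# With fine_tune=True and use_character=True the function returns 15 values.
(X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev,
 X_test, Y_test, mask_test, embedd_table, label_alphabet,
 C_train, C_dev, C_test, char_embedd_table) = datasets
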
Example 2
def create_alphabets(alphabet_directory,
                     data_paths,
                     max_vocabulary_size,
                     normalize_digits=True):
    logger = utils.get_logger("Create Alphabets")
    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not gfile.Exists(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        vocab = dict()
        for data_path in data_paths:
            logger.info("Processing data: %s" % data_path)
            with gfile.GFile(data_path, mode="r") as file:
                for line in file:
                    line = line.decode('utf-8')
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    word = DIGIT_RE.sub(
                        b"0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
        logger.info("Type Alphabet Size: %d" % type_alphabet.size())

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
        for word in vocab_list:
            word_alphabet.add(word)

        word_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)

    else:
        word_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    return word_alphabet, pos_alphabet, type_alphabet
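
A hedged usage sketch: the directory and data paths below are hypothetical, and the function depends on module-level names (ROOT_POS, ROOT_TYPE, PAD_POS, PAD_TYPE, DIGIT_RE, _START_VOCAB, Alphabet, gfile, utils) from the original module.

# Build the alphabets from two hypothetical CoNLL-style files, capping the word
# vocabulary at the 50,000 most frequent forms; on a second run the alphabets
# are reloaded from 'data/alphabets/' instead of being rebuilt.
word_alphabet, pos_alphabet, type_alphabet = create_alphabets(
    'data/alphabets/', ['data/train.conll', 'data/dev.conll'],
    max_vocabulary_size=50000, normalize_digits=True)
print("words: %d, pos: %d, types: %d"
      % (word_alphabet.size(), pos_alphabet.size(), type_alphabet.size()))
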
Example 3
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30):
    """
    generate data for charaters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :return: C_train, C_dev, C_test, char_embedd_table
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)

                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)

                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)
        mask_c = np.ones([len(index_sentences), max_sent_length, max_char_length], dtype=theano.config.floatX)

        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
                mask_c[i, j, char_length:] = 0
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
            mask_c[i, sent_length:, :] = 0
        return C, mask_c

    def build_char_embedd_table():
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
            theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(sentences_test)

    # close character alphabet
    char_alphabet.close()
    char_alphabet.save('tmp', 'char_alphabet')
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" % max_char_length_train)
    logger.info("Maximum character length of dev set is %d" % max_char_length_dev)
    logger.info("Maximum character length of test set is %d" % max_char_length_test)
    logger.info("Maximum character length used for training is %d" % max_char_length)

    # fill character tensor
    C_train, mask_c_train = construct_tensor_char(index_sentences_train)
    C_dev, mask_c_dev = construct_tensor_char(index_sentences_dev)
    C_test, mask_c_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table(), mask_c_train, mask_c_dev, mask_c_test
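
A small usage sketch with hypothetical tokenized sentences; the function assumes the module-level MAX_CHAR_LENGTH, word_end, logger, Alphabet, numpy and theano imports from the original source.

# Tokenized sentences (lists of word strings) for each split; purely illustrative.
sents_train = [['EU', 'rejects', 'German', 'call'], ['Peter', 'Blackburn']]
sents_dev = [['BRUSSELS', '1996-08-22']]
sents_test = [['The', 'European', 'Commission']]

C_train, C_dev, C_test, char_table, mask_train, mask_dev, mask_test = \
    generate_character_data(sents_train, sents_dev, sents_test, max_sent_length=10)
# C_train has shape (num_train_sentences, max_sent_length, max_char_length);
# char_table has shape (char_alphabet_size, char_embedd_dim).
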