def __init__(self, path_to_glove='glove.6B.200d.txt',
                 embedding_dim=200, prep_Data_from = 'train', purpose='train_model'):
        """
        NOTE-
        Beware of NaNs, drop them beforehand

        Dataset is the Liar Dataset. The description of the data can be found here -
        "Liar, Liar Pants on Fire: A New Benchmark Dataset for Fake News Detection - https://arxiv.org/abs/1705.00648"
        Download the dataset from - https://github.com/Tariq60/LIAR-PLUS
        Find the Glove vectors at https://nlp.stanford.edu/projects/glove/ and download the 822MB one.
        It contains 50d,100d, 200d and 300d vectors.
        300d with 400K vocab takes around 1.5GB RAM, choose file according to your system.
        We have prepared test cases using the 200d vectors. 

        :param path_to_glove: path to the desired glove vector file. File would be a .txt file
        :param embedding_dim: The dimension of vector you are choosing.
        :param prep_Data_from: Chose file from which you wanna prep data. 
        :param purpose: This is only used by the test.py file. This parameter should not concern you. When making your dataloaders, DO NOT pass this parameter. 
        """
        assert prep_Data_from in ['train', 'test', 'val']
        assert purpose in ['train_model', 'test_class']
        
        if purpose == 'train_model':
            path_to_train = 'train2.tsv'
            path_to_val = 'val2.tsv'
            path_to_test = 'test2.tsv'
        else:
            path_to_train = 'sample_train.tsv'
            path_to_test = 'sample_test.tsv'
            path_to_val = 'sample_val.tsv'

        train_Dataframe = pandas.read_csv(path_to_train, sep='\t', header=None).dropna()
        test_Dataframe = pandas.read_csv(path_to_test, sep='\t', header=None).dropna()
        val_Dataframe = pandas.read_csv(path_to_val, sep='\t', header=None).dropna()

        self.embeddings = create_glove_dict(path_to_glove)
        self.embedding_dim = embedding_dim
        self.dataframe = pandas.concat([train_Dataframe, test_Dataframe, val_Dataframe])

        self.justification_max = get_max_length(self.dataframe, 15)
        self.statement_max = get_max_length(self.dataframe, 3)

        if prep_Data_from == 'train':
            self.dataframe = train_Dataframe
        elif prep_Data_from == 'val':
            self.dataframe = val_Dataframe
        elif prep_Data_from == 'test':
            self.dataframe = test_Dataframe

        del train_Dataframe, test_Dataframe, val_Dataframe

        self.labels = {"true": 0,
                       "mostly-true": 1,
                       "half-true": 2,
                       "barely-true": 3,
                       "false": 4,
                       "pants-fire": 5}
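A minimal usage sketch for the loader above, assuming the __init__ belongs to a torch-style Dataset subclass (called LiarDataset here purely for illustration) that also implements __len__ and __getitem__; only the constructor signature comes from the snippet:

from torch.utils.data import DataLoader

# Hypothetical class name and DataLoader wiring; do not pass `purpose` when training.
train_set = LiarDataset(path_to_glove='glove.6B.200d.txt',
                        embedding_dim=200,
                        prep_Data_from='train')
val_set = LiarDataset(path_to_glove='glove.6B.200d.txt',
                      embedding_dim=200,
                      prep_Data_from='val')

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)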
Example 2
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    word_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, label_sentences_test = load_conll_data(test_path)
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(
        label_sentences_train, 'pos')
    alphabet_label.save('pre-trained-model/pos', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(
        label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(
        label_sentences_test, alphabet_label)
    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train, label_sentences_id_train, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev, label_sentences_id_dev, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test, label_sentences_id_test, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(
        word_sentences_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim,
                                                      alphabet_char)
    alphabet_char.save('pre-trained-model/pos', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(
        word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(
        word_sentences_test, alphabet_char)
    max_char_length = max(max_char_length_train, max_char_length_dev,
                          max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train, max_length,
                                             max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length,
                                           max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length,
                                            max_char_length, alphabet_char)
    num_labels = alphabet_label.size() - 1
    num_data = word_train.shape[0]
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
           label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
           char_embedd_table, num_labels, num_data
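The function above relies on module-level globals that are not shown in this excerpt (unknown_embedd, embedding_words, embedding_vectors, embedd_dim, word_end). A minimal sketch of how they might be prepared from a GloVe-style text file; the helper, file name, and end-of-word marker are assumptions for illustration, not part of the original code:

import numpy as np

def load_embeddings(path, dim):
    # Read a GloVe-style text file: one token per line followed by `dim` float values.
    words, vectors = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != dim + 1:
                continue  # skip malformed lines
            words.append(parts[0])
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return words, np.stack(vectors)

embedd_dim = 100                                      # must match the embedding file
embedding_words, embedding_vectors = load_embeddings('glove.6B.100d.txt', embedd_dim)
unknown_embedd = np.random.uniform(-0.1, 0.1, (1, embedd_dim))  # vector for out-of-vocabulary words
word_end = '##WE##'                                   # end-of-word marker seeded into the char alphabet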
Example 3
def train():
    # Assumes the utils helpers, caption_model(), and `load` (pickle.load) are
    # defined or imported elsewhere in the original file; this excerpt is not self-contained.
    filename = 'Flick_8k.trainImages.txt'
    train = utils.load_ids(filename)
    train_captions = utils.load_clean_captions('descriptions.txt', train)
    train_features = utils.load_photos_features('features.pkl', train)
    tokenizer = load(open('tokenizer.pkl', 'rb'))
    vocab_size = len(tokenizer.word_index) + 1
    max_len = utils.get_max_length(train_captions)

    model = caption_model(vocab_size, max_len)
    epochs = 20
    steps = len(train_captions)
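    # Assumed continuation (not in the excerpt above): one pass of a hypothetical
    # data_generator per epoch, saving a checkpoint after each one. The generator is
    # assumed to yield ([photo_features, input_sequences], output_words) batches.
    for i in range(epochs):
        generator = data_generator(train_captions, train_features, tokenizer, max_len, vocab_size)
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        model.save('model_' + str(i) + '.h5')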
Example 4
# read dev data
logger.info("Reading data from dev set...")
word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = dp.read_conll_sequence_labeling(
    dev_path, word_alphabet, label_alphabet, word_column, label_column)

# Close the alphabets: once closed, no more words can be added to the vocabularies.
# TODO: change this so that the alphabets are closed right after the training set alone.
word_alphabet.close()
label_alphabet.close()

# We subtract 1 because the zero index is not used; I believe it is reserved for unknown words.
logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))
# get maximum length : this is mainly for padding.
max_length_train = utils.get_max_length(word_sentences_train)
max_length_dev = utils.get_max_length(word_sentences_dev)
#max_length_test = utils.get_max_length(word_sentences_test)
max_length = min(dp.MAX_LENGTH, max(max_length_train, max_length_dev))
logger.info("Maximum length (i.e max words ) of training set is %d" %
            max_length_train)
logger.info("Maximum length (i.e max words ) of dev set is %d" %
            max_length_dev)
#logger.info("Maximum length (i.e max words ) of test set is %d" % max_length_test)
logger.info("Maximum length (i.e max words ) used for training is %d" %
            max_length)

logger.info("Padding training text and lables ...")
word_index_sentences_train_pad, train_seq_length = utils.padSequence(
    word_index_sentences_train, max_length, beginZero=FLAGS.PadZeroBegin)
label_index_sentences_train_pad, _ = utils.padSequence(
    label_index_sentences_train, max_length, beginZero=FLAGS.PadZeroBegin)
Example 5
File: ner.py Project: Lanuet/NNVLP
def create_data_2_train(train_path, dev_path, test_path, char_embedd_dim):
    word_sentences_train, pos_sentences_train, chunk_sentences_train, label_sentences_train = load_conll_data(train_path)
    word_sentences_dev, pos_sentences_dev, chunk_sentences_dev, label_sentences_dev = load_conll_data(dev_path)
    word_sentences_test, pos_sentences_test, chunk_sentences_test, label_sentences_test = load_conll_data(test_path)
    max_length_train = utils.get_max_length(word_sentences_train)
    max_length_dev = utils.get_max_length(word_sentences_dev)
    max_length_test = utils.get_max_length(word_sentences_test)
    max_length = max(max_length_train, max_length_dev, max_length_test)

    pos_sentences_id_train, alphabet_pos = utils.map_string_2_id_open(pos_sentences_train, 'pos')
    alphabet_pos.save('pre-trained-model/ner', name='alphabet_pos')
    pos_sentences_id_dev = utils.map_string_2_id_close(pos_sentences_dev, alphabet_pos)
    pos_sentences_id_test = utils.map_string_2_id_close(pos_sentences_test, alphabet_pos)
    chunk_sentences_id_train, alphabet_chunk = utils.map_string_2_id_open(chunk_sentences_train, 'chunk')
    alphabet_chunk.save('pre-trained-model/ner', name='alphabet_chunk')
    chunk_sentences_id_dev = utils.map_string_2_id_close(chunk_sentences_dev, alphabet_chunk)
    chunk_sentences_id_test = utils.map_string_2_id_close(chunk_sentences_test, alphabet_chunk)
    label_sentences_id_train, alphabet_label = utils.map_string_2_id_open(label_sentences_train, 'ner')
    alphabet_label.save('pre-trained-model/ner', name='alphabet_label')
    label_sentences_id_dev = utils.map_string_2_id_close(label_sentences_dev, alphabet_label)
    label_sentences_id_test = utils.map_string_2_id_close(label_sentences_test, alphabet_label)

    word_train, label_train, mask_train = \
        utils.construct_tensor_word(word_sentences_train, label_sentences_id_train, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_dev, label_dev, mask_dev = \
        utils.construct_tensor_word(word_sentences_dev, label_sentences_id_dev, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)
    word_test, label_test, mask_test = \
        utils.construct_tensor_word(word_sentences_test, label_sentences_id_test, unknown_embedd, embedding_words,
                                    embedding_vectors, embedd_dim, max_length)

    pos_train = utils.construct_tensor_onehot(pos_sentences_id_train, max_length, alphabet_pos.size())
    pos_dev = utils.construct_tensor_onehot(pos_sentences_id_dev, max_length, alphabet_pos.size())
    pos_test = utils.construct_tensor_onehot(pos_sentences_id_test, max_length, alphabet_pos.size())
    chunk_train = utils.construct_tensor_onehot(chunk_sentences_id_train, max_length, alphabet_chunk.size())
    chunk_dev = utils.construct_tensor_onehot(chunk_sentences_id_dev, max_length, alphabet_chunk.size())
    chunk_test = utils.construct_tensor_onehot(chunk_sentences_id_test, max_length, alphabet_chunk.size())
    word_train = np.concatenate((word_train, pos_train), axis=2)
    word_train = np.concatenate((word_train, chunk_train), axis=2)
    word_dev = np.concatenate((word_dev, pos_dev), axis=2)
    word_dev = np.concatenate((word_dev, chunk_dev), axis=2)
    word_test = np.concatenate((word_test, pos_test), axis=2)
    word_test = np.concatenate((word_test, chunk_test), axis=2)
    alphabet_char = LabelEncoder('char')
    alphabet_char.get_index(word_end)
    index_sentences_train, max_char_length_train = utils.get_character_indexes(word_sentences_train, alphabet_char)
    alphabet_char.close()
    char_embedd_table = utils.build_char_embedd_table(char_embedd_dim, alphabet_char)
    alphabet_char.save('pre-trained-model/ner', name='alphabet_char')
    index_sentences_dev, max_char_length_dev = utils.get_character_indexes(word_sentences_dev, alphabet_char)
    index_sentences_test, max_char_length_test = utils.get_character_indexes(word_sentences_test, alphabet_char)
    max_char_length = max(max_char_length_train, max_char_length_dev, max_char_length_test)
    char_train = utils.construct_tensor_char(index_sentences_train, max_length, max_char_length, alphabet_char)
    char_dev = utils.construct_tensor_char(index_sentences_dev, max_length, max_char_length, alphabet_char)
    char_test = utils.construct_tensor_char(index_sentences_test, max_length, max_char_length, alphabet_char)
    num_labels = alphabet_label.size() - 1
    num_data, _, embedd_dim_concat = word_train.shape
    # Optional debug: print np.shape(...) of the word/char/mask/label tensors to verify dimensions.
    np.save("word_train.npy",word_train)
    np.save("label_train.npy", label_train)
    print("Done")
    return word_train, word_dev, word_test, char_train, char_dev, char_test, mask_train, mask_dev, mask_test, \
           label_train, label_dev, label_test, alphabet_label, alphabet_char, max_length, max_char_length, \
           char_embedd_table, num_labels, num_data, embedd_dim_concat
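For clarity on the feature concatenation above: the word tensor of shape (num_data, max_length, embedd_dim) is extended along its last axis with one-hot POS and chunk features, so embedd_dim_concat equals embedd_dim plus the two alphabet sizes. A toy check of that shape arithmetic with made-up sizes:

import numpy as np

num_data, max_length = 4, 10                 # made-up sizes for illustration
embedd_dim, n_pos, n_chunk = 100, 20, 9

word = np.zeros((num_data, max_length, embedd_dim))
pos = np.zeros((num_data, max_length, n_pos))       # one-hot POS features
chunk = np.zeros((num_data, max_length, n_chunk))   # one-hot chunk features

word = np.concatenate((word, pos), axis=2)
word = np.concatenate((word, chunk), axis=2)
assert word.shape == (num_data, max_length, embedd_dim + n_pos + n_chunk)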