Example #1
    def sequence_vectorize(self, train_texts, val_texts):
        """Vectorizes texts as sequence vectors.
    
        1 text = 1 sequence vector with fixed length.
    
        # Arguments
            train_texts: list, training text strings.
            val_texts: list, validation text strings.
    
        # Returns
            x_train, x_val, word_index: vectorized training and validation
                texts and word index dictionary.
        """
        # Create vocabulary with training texts.
        tokenizer = text.Tokenizer(num_words=self.TOP_K)
        tokenizer.fit_on_texts(train_texts)

        # Vectorize training and validation texts.
        x_train = tokenizer.texts_to_sequences(train_texts)
        x_val = tokenizer.texts_to_sequences(val_texts)

        # Get max sequence length.
        max_length = len(max(x_train, key=len))
        if max_length > self.MAX_SEQUENCE_LENGTH:
            max_length = self.MAX_SEQUENCE_LENGTH

        # Fix sequence length to max value. Sequences shorter than the length are
        # padded in the beginning and sequences longer are truncated
        # at the beginning.
        x_train = sequence.pad_sequences(x_train, maxlen=max_length)
        x_val = sequence.pad_sequences(x_val, maxlen=max_length)
        return x_train, x_val, tokenizer.word_index
def build_embedding_matrix(x_train, x_test, maxlen, first_time, file_path):

    #Tokenizer
    if first_time:
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(x_train)
        dump(tokenizer, 'tokenizer.joblib', compress=3)
    else:
        tokenizer = load('tokenizer.joblib')

    #Word index
    word_index = tokenizer.word_index

    #Embedding matrix
    if first_time:
        print('Loading embedding index')
        embedding_index = load_embedding_index(file_path)
        print('Building our embedding matrix')
        embedding_matrix = np.zeros((len(word_index) + 1, 300))
        for word, i in word_index.items():
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        dump(embedding_matrix, 'embedding_matrix.joblib', compress=3)
    else:
        embedding_matrix = load('embedding_matrix.joblib')

    # Tokenizing + padding
    seq_x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train),
                                         maxlen=maxlen)

    seq_x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test),
                                        maxlen=maxlen)

    return seq_x_train, seq_x_test, embedding_matrix, word_index
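
build_embedding_matrix calls a load_embedding_index helper that is not shown in this listing. Below is a minimal sketch of what such a helper typically looks like, assuming a GloVe/fastText-style text file of "word v1 ... v300" lines; the function name matches the call above, but the file format and the 300-dimensional vectors are assumptions.

import numpy as np

def load_embedding_index(file_path):
    # Hypothetical helper: map each word in the embedding file to its vector.
    embedding_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] == 300:  # skips a possible fastText header line
                embedding_index[values[0]] = vector
    return embedding_index
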
Example #3
def preprocess(data):
    tokenizer = text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(data)
    data_x = tokenizer.texts_to_sequences(data)
    data_x = sequence.pad_sequences(data_x, maxlen=max_length)
    data_x = np.array(data_x)
    return data_x
Example #4
def preprocess(data_x, data_y, train_test_ratio=0.9):

    #Build vocabulary
    max_length = max(len(sentence.split(" ")) for sentence in data_x)

    vectorizer = CountVectorizer(lowercase=True, max_df=100)
    vectorizer.fit(data_x)
    vocab_size = len(vectorizer.vocabulary_)

    tokenizer = text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(data_x)
    data_x = tokenizer.texts_to_sequences(data_x)
    data_x = sequence.pad_sequences(data_x, maxlen=max_length)
    data_x = np.array(data_x)
    #vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
    #data_x          = np.array(list(vocab_processor.fit_transform(data_x)))
    data_y = np.array(data_y)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(data_y.shape[0]))
    x_shuffled = data_x[shuffle_indices]
    y_shuffled = data_y[shuffle_indices]

    #Divide data
    data_length = len(x_shuffled)
    dividing_index = int(train_test_ratio * data_length)

    train_x = x_shuffled[:dividing_index]
    train_y = y_shuffled[:dividing_index]
    test_x = x_shuffled[dividing_index:]
    test_y = y_shuffled[dividing_index:]

    return train_x, train_y, test_x, test_y, vocab_size
Example #5
def tokenize(sent_list):
    tokenizer = text.Tokenizer(filters='', oov_token='<unk>')
    tokenizer.fit_on_texts(sent_list)
    tensor_list = tokenizer.texts_to_sequences(sent_list)
    tensor_list = sequence.pad_sequences(tensor_list, padding='post')

    return {'Tensors': tensor_list, 'Tokenizer': tokenizer}
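
A quick usage sketch for the helper above (the sentences are made up, and the keras text/sequence imports used throughout these examples are assumed). It mainly shows that padding='post' appends zeros instead of prepending them:

out = tokenize(['i like tea', 'tea is good for you'])
print(out['Tensors'])               # rows padded with trailing zeros to equal length
print(out['Tokenizer'].word_index)  # '<unk>' sits at index 1 as the OOV token
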
Example #6
def get_tokenizer():
    with open(PATH + '/config/ruvec/vocab.txt') as f:
        words = [i[:-1] for i in f.readlines()]
    csv = read_csv(PATH + '/assets/csv/names.csv')
    train = csv['text']
    token = text.Tokenizer(num_words=None)
    token.fit_on_texts(list(train) + words)
    return token
Example #7
    def _init_tokenizer(self):
        '''Initialise tokenizer for processing decay strings'''

        tokenize = text.Tokenizer(
            num_words=self.num_pdg_codes,
            filters='!"#$%&*+,./:;=?@[\\]^_`{|}~'
        )
        tokenize.fit_on_texts(evtPdl.pdgTokens)
        return tokenize
Example #8
    def create_tokenizer(self, text_list):
        """
        This class allows you to vectorize a text corpus, by turning each text
        into either a sequence of integers (each integer being the index of a
        token in a dictionary) or into a vector where the coefficient for each
        token could be binary, based on word count, or based on tf-idf.
        """
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer
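
The docstring above paraphrases the Keras Tokenizer documentation: a fitted tokenizer can emit either integer sequences or per-document vectors. A small illustration on a toy corpus (the corpus and num_words value are made up):

from tensorflow.keras.preprocessing import text

toy = text.Tokenizer(num_words=20)
toy.fit_on_texts(['the cat sat', 'the cat sat on the mat'])
print(toy.texts_to_sequences(['the cat sat']))               # integer indices
print(toy.texts_to_matrix(['the cat sat'], mode='binary'))   # 0/1 presence
print(toy.texts_to_matrix(['the cat sat'], mode='count'))    # raw counts
print(toy.texts_to_matrix(['the cat sat'], mode='tfidf'))    # tf-idf weights
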
Example #9
def sequence_vectorize(train_texts, val_texts):
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    max_len = len(max(x_train, key=len))
    if max_len > MAX_SEQUENCE_LENGTH:
        max_len = MAX_SEQUENCE_LENGTH
    
    x_train = sequence.pad_sequences(x_train, maxlen = max_len)
    x_val = sequence.pad_sequences(x_val, maxlen=max_len)
    return x_train, x_val, tokenizer.word_index
def create__inputs_outputs(train_dataframe, test_dataframe, num_of_tokenizer):
    number_of_words = num_of_tokenizer
    tokenizer = text.Tokenizer(num_words=number_of_words)
    tokenizer.fit_on_texts(train_dataframe['sentence'])

    train_tokens_left, train_tokens_right, train_tokens_all, test_tokens_left, test_tokens_right, test_tokens_all = create_sequences(
        tokenizer, train_dataframe, test_dataframe)

    labels_train = np.asanyarray(
        pd.get_dummies(train_dataframe['label'], prefix=['label']))
    labels_test = np.asanyarray(
        pd.get_dummies(test_dataframe['label'], prefix=['label']))

    return tokenizer, train_tokens_left, train_tokens_right, train_tokens_all, test_tokens_left, test_tokens_right, test_tokens_all, labels_train, labels_test
Example #11
    def __init__(self, args):
        print('init processor')
        self._vocab_size = args.NB_WORDS
        self._max_sequence_length = args.MAX_SEQUENCE_LENGTH

        self._tokenizer = text.Tokenizer(num_words= args.NB_WORDS ,
                                        filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                                        lower=True, char_level=False) # save all vocabularies

        self._pad_sequences = pad_sequences
        self._re_sub = re.sub

        self._head_word=args.START_WORD
        self._end_word=args.END_WORD
Example #12
def get_tokenizer_and_train(divided_code_path: str, save_tokenizer_path: str,
                            save_file_path: str, min_count: int,
                            max_len: int) -> None:
    with open(divided_code_path, "rb") as f:
        all_word = pickle.load(f)
    tokenizer = text.Tokenizer(filters="", lower=False, char_level=False)
    tokenizer.fit_on_texts(all_word)
    treated_str_train = []

    count = 0
    dir_file_list = []
    final_train = []
    for s in all_word:
        sentence = []
        for w in s:
            if tokenizer.word_counts[w] >= min_count:
                sentence.append(w)
            else:
                sentence.append("@OTHER")
        treated_str_train.append(sentence)
        dir_file_list.append(sentence)
        count += 1
        if count == 500:
            final_train.append(dir_file_list)
            dir_file_list = []
            count = 0
    save_tokenizer = text.Tokenizer(filters="", lower=False, char_level=False)
    save_tokenizer.fit_on_texts(treated_str_train)

    for i in range(len(final_train)):
        term1 = save_tokenizer.texts_to_sequences(final_train[i])
        final_train[i] = pad_sequences(term1, maxlen=max_len)
    with open(save_tokenizer_path, "wb") as f:
        pickle.dump(save_tokenizer, f)
    with open(save_file_path, "wb") as f:
        pickle.dump(final_train, f)
    return
Example #13
def prepare_text_for_cbow(all_words):
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(all_words[0])
    word2id = tokenizer.word_index

    # build vocabulary of unique words
    word2id['PAD'] = 0
    id2word = {v: k for k, v in word2id.items()}
    wids = [word2id[w] for w in text.text_to_word_sequence(all_words[0][0])]

    vocab_size = len(word2id)
    embed_size = 10
    window_size = 2  # context window size

    print('Vocabulary Size:', vocab_size)
    print('Vocabulary Sample:', list(word2id.items())[:10])
    return wids, vocab_size, embed_size, window_size, id2word, word2id
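
prepare_text_for_cbow only returns word ids and the context window size; the CBOW training pairs are built elsewhere. A minimal sketch of that step under the usual CBOW layout (the helper name is hypothetical, not part of the example):

def generate_cbow_pairs(wids, window_size):
    # For each position, the words inside the window form the context and the
    # centre word is the prediction target.
    pairs = []
    for i, target in enumerate(wids):
        start = max(0, i - window_size)
        end = min(len(wids), i + window_size + 1)
        context = [wids[j] for j in range(start, end) if j != i]
        pairs.append((context, target))
    return pairs
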
    def fit(self, corpus):
        # load train and test data from the corpus
        train_data = corpus.train_data
        test_data = corpus.test_data

        # pre-processing by helper functions
        x_train, y_train = helper.get_text_and_labels(train_data)
        x_test, y_test = helper.get_text_and_labels(test_data)

        # tokenize, pad, convert to matrix
        tokenizer = text.Tokenizer(num_words=self.max_words, char_level=False)
        tokenizer.fit_on_texts(x_train)

        x_train = tokenizer.texts_to_matrix(x_train)
        x_test = tokenizer.texts_to_matrix(x_test)

        x_train = sequence.pad_sequences(x_train, maxlen=self.sequence_length)
        x_test = sequence.pad_sequences(x_test, maxlen=self.sequence_length)

        # one hot encode labels
        encoder = LabelEncoder()
        encoder.fit(y_train)

        # update class names
        self.class_names = encoder.classes_

        y_train_encoded = encoder.transform(y_train)
        y_test_encoded = encoder.transform(y_test)

        # update
        self.n_classes = np.max(y_train_encoded) + 1

        y_train_encoded = utils.to_categorical(y_train_encoded, self.n_classes)
        y_test_encoded = utils.to_categorical(y_test_encoded, self.n_classes)

        self.x_train = x_train
        self.x_test = x_test

        self.y_train = y_train_encoded
        self.y_test = y_test_encoded

        # update weights
        self.__load_weights(vocabulary=tokenizer.word_index)
    def __init__(self, train_texts, calculate_vocab):
        '''
        calculate_vocab: Boolean, if True we calculate the vocabulary from the dataset.
        '''

        #Get vocabulary.
        if calculate_vocab == False:
            with open("PretrainedEmbedding/vocab.txt") as f:
                self.word_index = {}
                i = 1
                for line in f:
                    (key, _) = line.split()
                    self.word_index[key] = i
                    i += 1
                    if i == MAX_FEATURES + 1:
                        break
        elif calculate_vocab == True:
            tokenizer = text.Tokenizer()
            tokenizer.fit_on_texts(train_texts)
            self.word_index = dict(
                itertools.islice(tokenizer.word_index.items(), MAX_FEATURES))
Example #16
def get_divided_code_with_min_count(divided_code_path: str,
                                    save_tokenizer_path: str,
                                    save_file_path: str,
                                    min_count: int) -> None:
    # First, replace every word that occurs fewer than min_count times with a single shared token
    with open(divided_code_path, "rb") as f:
        all_word = pickle.load(f)
    tokenizer = text.Tokenizer(filters="", lower=False, char_level=False)
    tokenizer.fit_on_texts(all_word)
    # i = tokenizer.word_counts
    with open(save_tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)
    with open(save_file_path, "w") as f:
        for s in all_word:
            for w in s:
                if tokenizer.word_counts[w] >= min_count:
                    f.write(w + " ")
                else:
                    f.write("@OTHER ")
            f.write("\n")
    return
Example #17
def sequence_vectorize(train_texts, val_texts, test_texts, model_dir):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
        test_texts: list, testing text strings.
        model_dir: string, location where the vectorizer and selector will be saved.

    # Returns
        x_train, x_val, x_test, word_index: vectorized training, validation,
            and testing texts, and the word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)
    x_test = tokenizer.texts_to_sequences(test_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_length)

    pickle.dump(tokenizer, open(os.path.join(model_dir, 'tokenizer.pickle'), "wb"))
    pickle.dump(max_length, open(os.path.join(model_dir, 'max_length.pickle'), "wb"))

    return x_train, x_val, x_test, tokenizer.word_index
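
A hedged companion sketch showing how the pickled tokenizer and max_length above might be reloaded at prediction time (the function name and its texts argument are assumptions):

import os
import pickle

from tensorflow.keras.preprocessing import sequence

def sequence_vectorize_for_inference(texts, model_dir):
    with open(os.path.join(model_dir, 'tokenizer.pickle'), 'rb') as f:
        tokenizer = pickle.load(f)
    with open(os.path.join(model_dir, 'max_length.pickle'), 'rb') as f:
        max_length = pickle.load(f)
    return sequence.pad_sequences(tokenizer.texts_to_sequences(texts),
                                  maxlen=max_length)
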
Example #18
def nn_setup(x, y, max_length=450, tokenizer_name='nn_model.pkl'):  
	x_train, x_test, y_train, y_test = train_test_split(x,y)

	tokenizer = text.Tokenizer(num_words=28331)

	tokenizer.fit_on_texts(x_train)
	train_sequences = tokenizer.texts_to_sequences(x_train)
	test_sequences = tokenizer.texts_to_sequences(x_test)

	tokenizer_pkl = open(tokenizer_name, 'wb')
	pickle.dump(tokenizer, tokenizer_pkl)
	files.download(tokenizer_name)
	tokenizer_pkl.close()

	x_train_seq = sequence.pad_sequences(train_sequences, maxlen=max_length)
	x_test_seq = sequence.pad_sequences(test_sequences, maxlen=max_length)

	y_train_seq = to_categorical(y_train)
	y_test_seq = to_categorical(y_test)

	weights = compute_class_weight(class_weight='balanced',
	                               classes=np.unique(y_train), y=y_train)
	weights_dict = dict(zip(np.unique(y_train), weights))
	return tokenizer, x_train_seq, x_test_seq, y_train_seq, y_test_seq, weights_dict
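
The weights_dict returned by nn_setup is meant for Keras' class_weight argument. A minimal sketch of that wiring (train_baseline and the tiny architecture are assumptions, not part of the example; vocab_size mirrors the num_words used above):

from tensorflow.keras import layers, models

def train_baseline(x_train_seq, y_train_seq, weights_dict, vocab_size=28331):
    model = models.Sequential([
        layers.Embedding(vocab_size, 64),
        layers.GlobalAveragePooling1D(),
        layers.Dense(y_train_seq.shape[1], activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # class_weight consumes the dict produced by compute_class_weight above.
    model.fit(x_train_seq, y_train_seq, class_weight=weights_dict,
              epochs=3, batch_size=32)
    return model
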
def get_divided_code(train_path: str, map_save_path: str, train_save_path: str,
                     save_tokenizer_path: str, word_num: int) -> None:
    # Insert spaces between the different components of the code
    all_word = []
    all_word_in_one_line = []
    for root, dirs, files in os.walk(train_path):
        dir_test = []
        for name in files:
            with open(os.path.join(root, name), "r") as f:
                code = f.read()
            r = discrete_code(code)
            dir_test.append(r)
            all_word_in_one_line.extend(r)
        if dir_test:
            all_word.append(dir_test)
    tokenizer = text.Tokenizer(lower=False, char_level=False)
    tokenizer.fit_on_texts([all_word_in_one_line])
    with open(save_tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)
    selected_word = sorted(tokenizer.word_counts.items(), key=lambda a: -a[1])
    selected_word = selected_word[:word_num - 1]
    word_map = dict()
    count = 0
    for w, _ in selected_word:
        word_map[w] = count
        count += 1
    with open(map_save_path, "wb") as f:
        pickle.dump(word_map, f)
    test_count = []
    for d in all_word:
        dir_test = []
        for s in d:
            dir_test.append(deal_with_s(word_map, s, word_num))
        test_count.append(dir_test)
    with open(train_save_path, "wb") as f:
        pickle.dump(test_count, f)
    def fit(self, instances):
        tokenizer = text.Tokenizer(lower=False, filters='', oov_token=None)
        tokenizer.fit_on_texts(instances)
        self._tokenizer = tokenizer
        self.number_words = len(tokenizer.word_index)
        print(self.number_words)
Example #21
# then fill all NaN values with the empty string '' (redundant)
x_train = train['comment_text'].fillna('').values

# y_train[i] = 1 where target >= 0.5, otherwise 0
y_train = np.where(train['target'] >= 0.5, 1, 0)

y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

#
# Take the column 'comment_text' from test,
# then fill all NaN values with the empty string '' (redundant)
x_test = test['comment_text'].fillna('').values

# https://keras.io/preprocessing/text/
# Tokenizer is a class with several text-vectorization methods
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)

# we apply the fit_on_texts method of the tokenizer to x_train and x_test;
# it initializes several attributes inside the tokenizer
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L139
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py#L210

tokenizer.fit_on_texts(list(x_train) + list(x_test))
# for example, after fit_on_texts, we can inspect
# tokenizer.word_counts     # gives an OrderedDict
# tokenizer.document_count  # an int
# tokenizer.word_index is a dict of words with corresponding indices
# There are 410046 different words in all 'comment_text'
# len(tokenizer.word_index) == 410_046
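
A tiny illustration of the attributes these comments mention, on a made-up two-sentence corpus (it reuses the text module already imported in this example; the real word_index here has several hundred thousand entries):

demo = text.Tokenizer()
demo.fit_on_texts(['a toxic comment', 'a perfectly fine comment'])
print(demo.word_counts)     # OrderedDict mapping word -> total count
print(demo.document_count)  # int, number of texts seen during fitting
print(demo.word_index)      # dict mapping word -> integer index (starting at 1)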

Example #22
def generateOOVEmbeddings():
    # read the (DL cleaned) dataset and build the vocabulary
    print('loading dataframes...')
    train_df = pd.read_csv('../data/training/train2.cleaned.dl.csv')
    test_df = pd.read_csv('../data/eval/test2.cleaned.dl.csv')

    # ps: forget memory and runtime, it's python here :D
    list_sentences_train = train_df["comment_text"].values
    list_sentences_test = test_df["comment_text"].values
    list_sentences_all = np.concatenate([list_sentences_train, list_sentences_test])

    tokenizer = text.Tokenizer(num_words=400000)
    tokenizer.fit_on_texts(list(list_sentences_all))
    print('word_index size:', len(tokenizer.word_index), 'words')
    word_index = tokenizer.word_index

    # load fastText - only the words
    print('loading fastText embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/crawl-300d-2M.vec')
    begin = True
    for line in f:
        if begin:
            begin = False
        else: 
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('fastText embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('fastText embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-fastText.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()

    # load gloves - only the words
    print('loading gloves embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/glove.840B.300d.txt')
    for line in f:
        values = line.split()
        word = ' '.join(values[:-300])
        voc.add(word)
    f.close()
    print('gloves embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('gloves embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-gloves.txt", "w") as oovFile:
        for w in oov:
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()

    # load word2vec - only the words
    print('loading word2vec embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/GoogleNews-vectors-negative300.vec')
    begin = True
    for line in f:
        if begin:
            begin = False
        else: 
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('word2vec embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('word2vec embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-w2v.txt", "w") as oovFile:
        for w in oov:    
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()

    # load numberbatch - only the words
    print('loading numberbatch embeddings...')
    voc = set()
    f = open('/mnt/data/wikipedia/embeddings/numberbatch-en-17.06.txt')
    begin = True
    for line in f:
        if begin:
            begin = False
        else: 
            values = line.split()
            word = ' '.join(values[:-300])
            voc.add(word)
    f.close()
    print('numberbatch embeddings:', len(voc), 'words')

    oov = []
    for tokenStr in word_index:
        if not tokenStr in voc:
            oov.append(tokenStr)

    print('numberbatch embeddings:', len(oov), 'out-of-vocabulary')

    with open("../data/training/oov-numberbatch.txt", "w") as oovFile:
        for w in oov:    
            oovFile.write(w)
            oovFile.write('\n')
    oovFile.close()
            embedding_matrix[i] = word2vec.word_vec(word)
    return embedding_matrix
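
The two lines above are the tail of an embedding-matrix builder whose opening lines were lost; build_matrix is referenced again further down. A hedged reconstruction of such a helper, with the signature inferred from that later call and the 300-dimensional size taken as an assumption:

import numpy as np

def build_matrix(word_index, word2vec, dim=300):
    # Assumed layout: one row per word index; out-of-vocabulary rows stay zero.
    embedding_matrix = np.zeros((len(word_index) + 1, dim))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = word2vec.word_vec(word)
        except KeyError:
            pass
    return embedding_matrix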


def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val
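
Because f1_score follows the (y_true, y_pred) signature Keras expects from custom metrics, it can be passed straight to compile, e.g. model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_score]); the model and loss here are placeholders, not part of the example.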


MAX_LEN = 300
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

embedding_models = [etnlp_word2vec, sonvx_word2vec]
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in embedding_models], axis=-1)

LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS

def get_token(text_list, num_words=VOCAB_SIZE):
    tokenizer = text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(text_list)
    return tokenizer
Example #25
print(x.shape)
x[8712] = ' '

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Input, Dropout
from tensorflow.keras.layers import Conv1D, AveragePooling1D, MaxPooling1D
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam

maxlen = 1000
max_words = 20000
embedding_dim = 50

tokenizer = text.Tokenizer(
    filters='0123456789!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    num_words=max_words)
tokenizer.fit_on_texts(x)
x_train = tokenizer.texts_to_sequences(x)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
print(x_train.shape)
y_train = y

inp = Input(shape=(maxlen, ))
x = Embedding(max_words, embedding_dim, input_length=maxlen)(inp)
x = Conv1D(32, 3, strides=1, padding='same', activation='relu')(x)
x = MaxPooling1D(3)(x)
x = LSTM(32)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
Example #26
    def __init__(self,
                 filename,
                 name='data_source',
                 input_name='input',
                 output_Name='reference',
                 vocab_size=100,
                 batch_size=200,
                 skip=500):
        with tf.name_scope(name):
            # Create tokenizer to map the strings onto sequences
            self.tokenizer = kpt.Tokenizer(num_words=vocab_size,
                                           filters='',
                                           lower=False,
                                           split='',
                                           char_level=True,
                                           oov_token=None)

            # Fit tokenizer on text
            with open(filename, 'r') as file:
                text = file.read()
            self.tokenizer.fit_on_texts([text])

            # Store reverse mapping
            self.reverse_mapping = {
                v: k
                for k, v in self.tokenizer.word_index.items()
            }

            # Create primary dataset
            # This dataset contains a rank-0 tensor (scalar) per line
            # Each such tensor is of type 'string'
            filename = "iliad.txt"
            dataset = tf.data.TextLineDataset(filename).skip(skip).repeat()

            # Convert string dataset into a binary category dataset
            # Each ()-tensor (containing one string) is created into a (l, v) tensor where
            #  - l is the length of the string
            #  - v is the size of the vocabulary
            # where output[i, j] == 1 if tokenizer.word_index[input[i]] == tokenizer.tokens[j] else 0
            # This is done by wrapping the tokenizer's "texts_to_matrix"-method in a "py_func" tensorflow op
            # (which maps a single entry) and calling dataset.map with a function that returns a single such op
            # (and sets the shape of the output tensor)
            def tokenize_op(x):
                def tokenize(y):
                    # Split the string into single-character lists (using the 'list' constructor)
                    # and call the texts_to_matrix method.
                    matrix = self.tokenizer.texts_to_matrix(
                        list(y.decode('utf-8')))

                    # Convert to float type
                    return matrix.astype(np.float32)

                # Wrap a call to the tokenize function with a float32 result
                out = tf.py_func(
                    tokenize,  # Target function
                    [x],  # Arguments
                    (
                        tf.float32
                    ),  # Return type (must be specified in advance because the function is called on demand)
                    False  # Whether this operation is stateful
                )

                # Add some shape information on the output for the tensorflow shape inference engine
                out.set_shape([tf.Dimension(None), vocab_size])
                return out

            matrix_dataset = dataset.map(tokenize_op)

            # Batch data together by padding all sequences to the longest one
            # In this case the datasets have shapes (l_i, vocab_size) and the operation produces
            # the shapes (batch_size, max(l_i), vocab_size)
            batched_dataset = matrix_dataset.padded_batch(
                batch_size, (-1, vocab_size))

            # Create input and output data by making datasets that take all but the last resp. the first element
            input_dataset = batched_dataset.map(lambda x: x[:, 0:-1, :])

            output_dataset = batched_dataset.map(lambda x: x[:, 1:, :])

            # Create iterators
            input_iterator = input_dataset.make_initializable_iterator()
            output_iterator = output_dataset.make_initializable_iterator()

            # Create action that just runs both initializer ops
            init_actions = [
                input_iterator.initializer, output_iterator.initializer
            ]
            with (tf.control_dependencies(init_actions)):
                self.initializer = tf.no_op('Initializer')

            # See below
            input_raw = input_iterator.get_next()
            output_raw = output_iterator.get_next()

        # Create action that retrieves data
        # Do this outside of the name scope so that people see the nodes
        # nicely next to the data-source node coming out of it
        # The nodes are each control-dependency linked to each other's sources so that the
        # data-sets always step forward together, even if only one is used.
        with tf.control_dependencies([input_raw, output_raw]):
            self.data = (tf.identity(input_raw, name=name + '_input'),
                         tf.identity(output_raw, name=name + '_output'))
Example #27
def get_lang_tokenize(texts):
    lang_tokenizer = text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(texts)

    return lang_tokenizer
Example #28
from df.loader import load_polish_train, load_forum_hate_speech

# xlsx from https://github.com/ybalcanci/Hate-Speech-Detector
df = load_polish_train()
df = df.append(load_forum_hate_speech(), ignore_index=True)

from preprocessing.pl.text_preprocessing import TextPreprocessor

TextPreprocessor().clean_data_frame(df, lemmatize=True)

train_posts = df['tweet']
train_tags = df['label']

max_words = 1000
# keras.preprocessing
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)  # only fit on train

with open(r"..\..\model\pl\rnn_tokenizer", 'wb') as handle:
    pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)

x_train = tokenize.texts_to_matrix(train_posts)

encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)

num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)

# Build the model
Example #29
            i = t.find('\n\n')  # skip header
            if 0 < i:
                t = t[i:]
            texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

# ### Vectorization
#
# Vectorize the text samples into a 2D integer tensor.

MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000

tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# ### TF Datasets
#
# Let's now define our TF Datasets
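
The listing breaks off before the dataset code appears. A hedged sketch of a typical continuation (the batch size and shuffle buffer are made up):

import tensorflow as tf

BATCH_SIZE = 32
dataset = tf.data.Dataset.from_tensor_slices((data, labels))
dataset = dataset.shuffle(buffer_size=len(data)).batch(BATCH_SIZE)
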
from tensorflow.keras.preprocessing import text, sequence

data_set = pd.read_csv('data_clean.csv')

#print(data_set.head())
data_train = data_set['text'][0:2000].astype(str)
data_labels = data_set['isReal'][0:2000].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(data_train,
                                                    data_labels,
                                                    test_size=0.25)

max_features = 1000
maxlen = 300

start = time.time()
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)
print('keras tokenize time: ', round(time.time() - start, 2), 's')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau

batch_size = 64