Example #1
def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenlize text.

    Vectorize a text corpus by transform each text in texts to a sequence of integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence, padding if shorter, trim is longer.
        x_train: List contains text data.

    Returns:
        x_train: Tokenlized input data.
        word_index: Dictionary contains word with tokenlized index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer
    print("tokenlizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data readed and convert to %d length sequences" % max_seq_length)
    return x_train, word_index
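# A minimal usage sketch for the helper above; the sample texts are made up for illustration.
sample_texts = ["the quick brown fox", "the lazy dog sleeps", "a quick brown dog"]
padded, word_index = tokenlize_text(max_num_words=1000,
                                    max_seq_length=10,
                                    x_train=sample_texts)
print(padded.shape)     # (3, 10): three texts, each padded/truncated to length 10
print(len(word_index))  # number of distinct words seen by the tokenizer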
Example #2

pool_size = 4

# RNN
rnn_output_size = 70

# Training
batch_size = 256
epochs = 5

print('Loading data...')
(x_train, y_train), (x_val, y_val), (x_test,
                                     y_test) = sentiment_140_neg.load_data()

print('Fitting tokenizer...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((x_train, x_val, x_test)))

print('Convert text to sequences')
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

print(len(x_train), 'train sequences')
print(len(x_val), 'validation sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
Example #3
import matplotlib.pyplot as plt
import numpy as np

from tensorflow import keras
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

data = open('archive/irish-lyrics-eof.txt').read()
corpus = data.lower().split("\n")

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

ys = keras.utils.to_categorical(labels, num_classes=total_words)
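# Sketch only: a next-word prediction model that could consume (xs, ys); the layer
# sizes below are illustrative assumptions, not taken from the original example.
model = keras.Sequential([
    keras.layers.Embedding(total_words, 64),
    keras.layers.LSTM(100),
    keras.layers.Dense(total_words, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(xs, ys, epochs=100)  # training itself is beyond the scope of this snippet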
Example #4
from keras.models import load_model
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import numpy as np

model = load_model('sentiment_model.h5')
test_data = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"]
max_features = 200
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(test_data)
X = tokenizer.texts_to_sequences(test_data)
max_len = 28
X = pad_sequences(X, maxlen=max_len)
class_names = ['positive', 'negative']
preds = model.predict(X)
print(preds)
classes = model.predict_classes(X)  # note: predict_classes() was removed in newer Keras versions
print(classes)
print(class_names[int(classes[0])])
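# A hedged equivalent for newer Keras releases, depending on whether the loaded model
# ends in a single sigmoid unit or a two-unit softmax (the saved architecture is not
# shown in this example):
probs = model.predict(X)
if probs.shape[-1] == 1:
    pred_classes = (probs > 0.5).astype(int).ravel()  # sigmoid: threshold at 0.5
else:
    pred_classes = np.argmax(probs, axis=-1)          # softmax: take the argmax
print(class_names[int(pred_classes[0])])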
Example #5

test_movie_df = pd.read_csv('movie_reviews/test.tsv',  # test path assumed, mirroring the train file below
                            delimiter='\t',
                            encoding='utf-8')
train_movie_df = pd.read_csv('movie_reviews/train.tsv',
                             delimiter='\t',
                             encoding='utf-8')

train_movie_df = train_movie_df.drop(columns=['PhraseId', 'SentenceId'])
test_movie_df = test_movie_df.drop(columns=['PhraseId', 'SentenceId'])

train_movie_df['Phrase'] = train_movie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))
test_movie_df['Phrase'] = test_movie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_movie_df['Phrase'].values)
X_train = tokenizer.texts_to_sequences(train_movie_df['Phrase'].values)
X_train = pad_sequences(X_train)

X_test = tokenizer.texts_to_sequences(test_movie_df['Phrase'].values)
X_test = pad_sequences(X_test)
print("handling data")

embed_dim = 128
lstm_out = 196


def create_model():
    sequential_model = Sequential()
    sequential_model.add(
Example #6
#train_df = pd.concat([pos_df, neg_df])
train_df = pd.concat([pos_df, neg_df, neu_df])
train_df = train_df.reset_index(drop=True)

x = train_df['Comment'].values
y = train_df['Sentiment Rating'].values
print(train_df.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=22)

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(train_df['Comment'])

X_train = tokenizer.texts_to_sequences(x_train)
X_test = tokenizer.texts_to_sequences(x_test)

seq_lens = [len(s) for s in X_train]
print(max(seq_lens))
#pypl.hist(seq_lens,bins=50)
pypl.hist([l for l in seq_lens if l < 200], bins=50)
pypl.show()

X_train = keras.preprocessing.sequence.pad_sequences(X_train,
                                                     padding='post',
                                                     maxlen=150)
X_test = keras.preprocessing.sequence.pad_sequences(X_test,
                                                    padding='post',
                                                    maxlen=150)
Example #7
def build_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
df["text"] = df["text"].apply(clean_text)

#corpus=[]
#for index in range(len(df["headlines"])):
#   corpus.append(preprocessing.text_preprocessing(df["headlines"][index]))


df.drop_duplicates(subset=["text"], inplace=True)
df.dropna(inplace=True)
print(df.head())
voc_size = 5000
sent_length = 100
X_train, X_test, y_train, y_test = train_test_split(df.text, df.target, test_size=0.3, random_state=37)
tk = Tokenizer(num_words=10000,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tk.fit_on_texts(X_train)
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=100)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=100)
model = Sequential()  # initializing the Sequential model
print(len(tk.index_word))

model.add(Embedding(len(tk.index_word) + 1, 32, input_length=100))  # +1 because Keras word indices start at 1
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

print(X_train_seq_trunc)
Example #9
test_label = np.load(BASE_DIR + "/test_label.npy").tolist()

train_label_encoder = preprocessing.LabelEncoder()
train_label_encoder.fit(train_label)

joblib.dump(train_label_encoder, DATA_DIR + '/label_encoder.pkl')

train_label = train_label_encoder.transform(train_label)
test_label = train_label_encoder.transform(test_label)

label_dict = dict(
    zip(list(train_label_encoder.classes_),
        train_label_encoder.transform(list(train_label_encoder.classes_))))
print('[INFO] Label dict:', label_dict)
tokenizer = Tokenizer(MAX_NB_WORDS)
tokenizer.fit_on_texts(train_text)
sequences = tokenizer.texts_to_sequences(train_text)

word_index = tokenizer.word_index
print('[INFO] Found %s unique word tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_label = to_categorical(np.asarray(train_label))
print('[INFO] Shape of data tensor:', data.shape)
print('[INFO] Shape of label tensor:', train_label.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]  # shuffle the training data
train_label = train_label[indices]  # shuffle the labels with the same permutation
Example #10

def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
Example #11
    #plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error [TPSA]')
    plt.legend()
    plt.grid(True)
    plt.savefig(filename)

os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["CUDA_VISIBLE_DEVICES"]="0"

df = pd.read_csv("sample_training_tpsa_caw.csv")
df = df.sample(frac=1)
texts = df.iloc[:,0].to_list()

tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)
print(tk.word_index)
print("word index len: ", len(tk.word_index))

sequences = tk.texts_to_sequences(texts)
#print(texts[0])
#print(sequences[0])

lens = [len(x) for x in sequences]
#print(lens)
print("max: ", max(lens))
sum_len = sum(lens)
print("sum ", sum_len)
avg_len = sum_len / len(lens)
print("avg_len: ", avg_len)
Example #12
with open("dynamic_feature_train.csv.pkl", "rb") as f:
    labels = pickle.load(f)  # training labels
    files = pickle.load(f)  # API-call text for each file

vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    min_df=3,
    max_df=0.9,
)  # TF-IDF feature extraction, ngram_range=(1, 5); words whose document frequency exceeds max_df are filtered from the vocabulary
train_features = vectorizer.fit_transform(files)  # convert the texts into a term-frequency matrix (fit first, then transform)

tfidftransformer_path = 'tfidf_transformer.pkl'
with open(tfidftransformer_path, 'wb') as fw:
    pickle.dump(vectorizer, fw)

#deep learning

with open("dynamic_feature_test.csv.pkl", "rb") as f:
    test_labels = pickle.load(f)
    outfiles = pickle.load(f)

tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      split=' ',
                      char_level=False,
                      oov_token=None)  # Tokenizer vectorizes a text corpus, turning each text into a sequence of integers

tokenizer.fit_on_texts(files)
tokenizer.fit_on_texts(outfiles)

pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))
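# Sketch only: how the pickled tokenizer might later be reloaded and applied to the
# training and test API-call texts (this step is not part of the original example).
with open('tokenizer.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)
train_seqs = loaded_tokenizer.texts_to_sequences(files)
test_seqs = loaded_tokenizer.texts_to_sequences(outfiles)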
Example #13
enc = LabelEncoder()
enc.fit(training_labels)
training_labels = enc.transform(training_labels)

##print(training_labels)

vocab_size = 10000
embedding_dim = 16
max_len = 20
trunc_type = 'post'
#padding = 'pre'
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)

padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_len)
classes = len(labels)
'''
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_len))

model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))

model.add(tf.keras.layers.Dense(classes, activation='softmax'))
Example #14
x_train, x_test, y_train, y_test = train_test_split(
    x_train['consumer_review'], x_train['polarity_label'], test_size=0.3)

# converting to lists
x_train = (x_train.values.tolist())

x_test = (x_test.values.tolist())

y_train = (y_train.values.tolist())

y_test = (y_test.values.tolist())

#tokenizer

tokenizer = Tokenizer()

tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

total_size = len(word_index) + 1

print(total_size)

#texts to sequence
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# add padding to ensure the same length

max_length = 100
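# The example is cut off here; the padding step it sets up would presumably look
# like this (a sketch, not the original code):
from keras_preprocessing.sequence import pad_sequences

x_train = pad_sequences(x_train, maxlen=max_length)
x_test = pad_sequences(x_test, maxlen=max_length)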
Example #15
batch_size = 128

if __name__ == '__main__':
    df = pd.read_csv("data/fake-news-pair-classification-challenge/train.csv",
                     nrows=40000)
    train = df.loc[:, ['title1_zh', 'title2_zh', 'label']]

    # 1. Tokenize, remove stop words and punctuation
    train['title1_tokenized'] = train['title1_zh'].apply(cut_word)
    train['title2_tokenized'] = train['title2_zh'].apply(cut_word)

    # 2. Build the vocabulary and pad the sequences
    x = pd.concat([train['title1_tokenized'], train['title2_tokenized']])
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(x)
    encoded1 = tokenizer.texts_to_sequences(train['title1_tokenized'])
    encoded2 = tokenizer.texts_to_sequences(train['title2_tokenized'])
    input_len = 25
    pad1 = pad_sequences(encoded1, maxlen=input_len)
    pad2 = pad_sequences(encoded2, maxlen=input_len)
    label = {'unrelated': 0, 'agreed': 1, 'disagreed': 2}
    y = train['label'].apply(lambda x: label[x])

    x1_train_all, x1_test, x2_train_all, x2_test, y_train_all, y_test = train_test_split(
        pad1, pad2, y)
    x1_train, x1_val, x2_train, x2_val, y_train, y_val = train_test_split(
        x1_train_all, x2_train_all, y_train_all)

    x1_train = tf.convert_to_tensor(x1_train, dtype=tf.float32)
    x2_train = tf.convert_to_tensor(x2_train, dtype=tf.float32)