Example #1
 def create_tokenizer(self, sentences):
     tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE)
     tokenizer.fit_on_texts(sentences)
     return tokenizer
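
For context, a minimal usage sketch of such a helper (the sample sentences and vocabulary size are made up, not taken from the original class):

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

sample_sentences = ["the cat sat on the mat", "the dog ate my homework"]
tok = Tokenizer(num_words=1000)                    # stands in for self.MAX_VOCAB_SIZE
tok.fit_on_texts(sample_sentences)
seqs = tok.texts_to_sequences(sample_sentences)    # lists of integer word indices
padded = pad_sequences(seqs, maxlen=10)            # uniform-length 2D array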
Example #2
from keras.layers.embeddings import Embedding

np.random.seed(0)

#load data
train_df = pd.read_csv('train.tsv', sep='\t', header=0)

all_data = train_df.drop(['SentenceId', 'PhraseId', 'Sentiment'], axis=1)
all_class = train_df['Sentiment']

#prepare data
all_data['Phrase'] = all_data['Phrase'].apply(lambda x: x.lower())
all_data['Phrase'] = all_data['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(all_data['Phrase'].values)
X = tokenizer.texts_to_sequences(all_data['Phrase'].values)
X = pad_sequences(X)

#encode classes
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(train_df['Sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

embed_dim = 128
#CNN model
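
The snippet is cut off right after the "#CNN model" comment; a hedged sketch of one possible CNN for the inputs prepared above (layer sizes and epochs are assumptions, not the original model):

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))   # one unit per sentiment class
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=3, batch_size=128)
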
    train_df, local_test_df = (train_df.iloc[:-n_test].reset_index(drop=True),
                               train_df.iloc[-n_test:].reset_index(drop=True))
else:
    local_test_df = pd.DataFrame([[None, "", 0], [None, "", 0]], columns=['qid', 'question_text', 'target'])
    n_test = 2

for df in [train_df, test_df, local_test_df]:
    df["question_text"] = df["question_text"].str.lower()
    df["question_text"] = df["question_text"].apply(lambda x: clean_text(x))
    df["question_text"].fillna("_##_", inplace=True)

x_train = train_df["question_text"].values
x_test = test_df["question_text"].values
x_test_local = local_test_df["question_text"].values

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train) + list(x_test_local))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_test_local = tokenizer.texts_to_sequences(x_test_local)

x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
x_test_local = pad_sequences(x_test_local, maxlen=maxlen)

y_train = train_df['target'].values
y_test = local_test_df['target'].values

glove_embeddings = load_glove(tokenizer.word_index, len(tokenizer.word_index) + 1)
paragram_embeddings = load_para(tokenizer.word_index, len(tokenizer.word_index) + 1)
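
A hedged follow-up (not part of the original snippet): kernels of this kind often blend the two pretrained matrices, assuming load_glove and load_para return arrays of the same shape.

# Assumption: both matrices have shape (len(tokenizer.word_index) + 1, embedding_dim).
embedding_matrix = np.mean([glove_embeddings, paragram_embeddings], axis=0)
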
from keras import layers
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Set the random seed
np.random.seed(0)

# Specify the number of features to use
number_of_features = 1000

# Load the movie review data and target vectors
(data_train,
 target_train), (data_test,
                 target_test) = imdb.load_data(num_words=number_of_features)

# One-hot encode the movie review data into a feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# Start building the neural network
network = models.Sequential()

# Add a fully connected layer with a ReLU activation function
network.add(
    layers.Dense(units=16,
                 activation="relu",
                 input_shape=(number_of_features, )))

# Add another fully connected layer with a ReLU activation function
network.add(layers.Dense(units=16, activation="relu"))
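
The snippet stops after the hidden layers; a hedged sketch of how such a binary sentiment network is commonly finished (output layer, compile, fit; the epoch and batch settings are placeholders):

network.add(layers.Dense(units=1, activation="sigmoid"))   # single sigmoid output
network.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
history = network.fit(features_train, target_train,
                      epochs=3, batch_size=100,
                      validation_data=(features_test, target_test))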
Example #5
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'sgd'
}

print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

sw_manager = StopwatchManager(stop_watch, compile_stop_watch)

print('\nBuilding network 1...')
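
Hypothetical continuation (the original is cut off here): a simple MLP wired up from the hyperparameters in the dict above; the 512-unit hidden layer is an assumption.

from keras.models import Sequential
from keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu',
                kernel_initializer='lecun_normal'))
model.add(Dropout(0.1))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])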
Example #6
def wordvector_training_data(lang='deu', n=700, data_paths=()):
    df = get_data(lang)
    n = int(len(df) * n) if n <= 1 else n
    n = min(len(df), n)
    df = df.iloc[:n]
    input_texts, target_texts = [], []  # <1>
    input_vocabulary = set()  # <3>
    output_vocabulary = set()
    start_token, stop_token = '<START>', '<STOP>'
    input_tokenizer, output_tokenizer = Tokenizer(), Tokenizer()
    wv = get_data('word2vec')
    EMBEDDING_DIM = len(wv['the'])

    for input_text, target_text in tqdm(zip(df.eng, df[lang]), total=n):
        target_text = start_token + target_text + stop_token
        input_texts.append(input_text)
        target_texts.append(target_text)
        # Build the character vocabularies here; they are needed below to size
        # and index the one-hot encoder/decoder tensors.
        input_vocabulary.update(input_text)
        output_vocabulary.update(target_text)

    # texts = input_texts + target_texts
    # assert(len(texts) == n * 2)
    # input_texts = texts[:n]
    # target_texts = texts[n:]

    input_tokenizer.fit_on_texts(input_texts)
    output_tokenizer.fit_on_texts(target_texts)
    input_sequences = input_tokenizer.texts_to_sequences(input_texts)
    target_sequences = output_tokenizer.texts_to_sequences(target_texts)
    input_sequences = pad_sequences(input_sequences,
                                    maxlen=MAX_INPUT_SEQUENCE_LENGTH)
    target_sequences = pad_sequences(target_sequences,
                                     maxlen=MAX_TARGET_SEQUENCE_LENGTH)

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
    for w, i in input_tokenizer.word_index.items():
        if w in wv.vocab:
            embedding_matrix[i] = wv.word_vec(w)
    print('Null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix != 0, axis=1) == 0))

    #     input_vocabulary = sorted(input_vocabulary)  # <1>
    #     output_vocabulary = sorted(output_vocabulary)

    input_vocab_size = len(input_vocabulary)  # <2>
    output_vocab_size = len(output_vocabulary)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])  # <3>
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    input_token_index = dict([
        (char, i) for i, char in enumerate(input_vocabulary)
    ])  # <4>
    target_token_index = dict([(char, i)
                               for i, char in enumerate(output_vocabulary)])

    encoder_input_data = np.zeros(
        (n, max_encoder_seq_length, input_vocab_size), dtype='float32')  # <2>
    decoder_input_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')
    decoder_target_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')
    for i, (input_text, target_text) in enumerate(
            tqdm(zip(input_texts, target_texts),
                 total=len(target_texts))):  # <3>
        for t, char in enumerate(input_text):  # <4>
            encoder_input_data[i, t, input_token_index[char]] = 1.  # <5>
        for t, char in enumerate(target_text):  # <6>
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1

    trainset = (encoder_input_data, decoder_input_data, decoder_target_data)
    for i, p in enumerate(data_paths):
        np.save(p, trainset[i][:n], allow_pickle=False)

    return encoder_input_data, decoder_input_data, decoder_target_data
def get_lstm_predictions(X_train, X_test, y_train, y_test, use_glove=False):

    split = len(X_train)
    # Roughly the average number of words per article; every sequence is padded/truncated to this length.
    MAX_SEQUENCE_LENGTH = 1000
    # This is fixed.
    EMBEDDING_DIM = 100

    X = X_train + X_test
    y = y_train + y_test
    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(X)
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', X.shape)

    # one hot encoding the labels
    y_one_hot = []
    for i in y:
        temp = np.zeros(23)
        temp[i - 1] = 1
        y_one_hot.append(temp)

    y_one_hot = np.array(y_one_hot)

    # splitting the train test
    X_train = X[:split]
    X_test = X[split:]
    y_train = y_one_hot[:split]
    y_test = y_one_hot[split:]

    # GloVe

    if use_glove:
        embeddings_index = dict()
        f = open('glove.6B.100d.txt')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        print('Loaded %s word vectors.' % len(embeddings_index))
        # create a weight matrix for words in training docs
        embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    # construction of the lstm
    model = Sequential()
    if use_glove:
        model.add(Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=X.shape[1]))
    else:
        model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(23, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    # running the model
    epochs = 20
    batch_size = 64
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test),
                        callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

    # plotting
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    plt.show()

    plt.title('Accuracy')
    plt.plot(history.history['acc'], label='train')
    plt.plot(history.history['val_acc'], label='test')
    plt.legend()
    plt.show()

    # getting predictions
    predictions = model.predict(X_test)
    predictions = np.array([np.argmax(i) + 1 for i in predictions])

    if use_glove:
        np.save('predictions/lstm_glove_predictions', predictions)
    else:
        np.save('predictions/lstm_predictions', predictions)
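
A hedged usage sketch (assumes the four lists already exist in scope; note that np.save appends a .npy suffix to the paths used above):

get_lstm_predictions(X_train, X_test, y_train, y_test, use_glove=True)
lstm_preds = np.load('predictions/lstm_glove_predictions.npy')
print(lstm_preds[:10])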
Example #8
from keras.models import Model
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, RepeatVector, TimeDistributed, Bidirectional
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD
from importlib import reload

num_words = 2000

input_tokenizer = Tokenizer(num_words=num_words,
                            lower=False,
                            split=' ',
                            filters='')
input_tokenizer.fit_on_texts(tokenized_xtext)
input_word_index = input_tokenizer.word_index
target_tokenizer = Tokenizer(num_words=num_words,
                             lower=False,
                             split=' ',
                             filters='')
target_tokenizer.fit_on_texts(tokenized_ytext)
target_word_index = target_tokenizer.word_index
text_tokenizer = Tokenizer(num_words=num_words,
                           lower=False,
                           split=' ',
                           filters='')
text_tokenizer.fit_on_texts(tokenized_xtext + tokenized_ytext)
word_index = text_tokenizer.word_index
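
A hedged follow-up sketch: converting the fitted tokenizers' texts into padded sequences (tokenized_xtext / tokenized_ytext come from above; the sequence length is a placeholder):

encoder_seqs = input_tokenizer.texts_to_sequences(tokenized_xtext)
decoder_seqs = target_tokenizer.texts_to_sequences(tokenized_ytext)
encoder_input = pad_sequences(encoder_seqs, maxlen=20, padding='post')   # maxlen=20 is assumed
decoder_input = pad_sequences(decoder_seqs, maxlen=20, padding='post')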
Example #9
        f.write(
            'conv_units=%d , lr=%.5f , c_drop=%.2f , maxlen=%d , embed_len=%d , conv_layers=%d\n'
            % (conv_units, lr, drop, maxlen, maxlen, conv_layers))
    with open('_' + log_dest, 'a') as f:
        f.write(
            'conv_units=%d , lr=%.5f , c_drop=%.2f , maxlen=%d , embed_len=%d , conv_layers=%d\n'
            % (conv_units, lr, drop, maxlen, maxlen, conv_layers))
    print(model.summary())
    return model


# fix random seed for reproducibility

X_train, y_train, X_dev, y_dev = get_data('train.txt', 'dev.txt')
print('Tokenization')
tokenizer = Tokenizer(lower=False, char_level=True, filters='0123456789')
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_dev_seq = tokenizer.texts_to_sequences(X_dev)
print(tokenizer.word_index)
print('Padding')
X_train = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_dev = pad_sequences(X_dev_seq, maxlen=maxlen, padding='post')
print(X_train[0])
print('One-hot Encoding')

encoder = LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_dev = encoder.transform(y_dev)
y_train = np.array([to_categorical(y, 5) for y in y_train])
train_cl = df[:train.shape[0]]
test_cl = df[train.shape[0]:]

list_classes = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
y_tr = train_cl[list_classes].values
list_sentences_train = train_cl.comment_text
list_sentences_test = test_cl.comment_text

print("....At....Tokenizer")

punctuate = r'([\.\!\?\:\,])'

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=max_features, oov_token=punctuate)
tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

totalNumWords = [len(one_comment) for one_comment in list_tokenized_train]
print("mean length:" + str(np.mean(totalNumWords)))
print("max length:" + str(max(totalNumWords)))
print("std length:" + str(np.std(totalNumWords)))

print(" maxlen is:" + str(maxlen))

print("number of different word:" + str(len(tokenizer.word_index.items())))

if len(tokenizer.word_index.items()) < max_features:
Example #11
print('Indexing word vectors.')

embeddings_index = {}
f = open('../glove.6B/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# In[5]:

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=False)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Shape of data tensor:', data.shape)

# In[7]:

# import nltk
# from keras.preprocessing.text import text_to_word_sequence
# raw_output = corpus.findall('.//sentence')
# train_out= np.zeros(shape=(3044,69))
with open("text8.txt", "r") as f:
    text = f.read()

#TODO: is the text_to_word_sequence step required or is the data already preprocessed?
filter_string = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
#TODO: tokenization cleanup
text = text_to_word_sequence(text,
                             filters=filter_string,
                             lower=True,
                             split=" ")
#text = tokenize.sent_tokenize(text)
#print text
print('No of words in corpus: {}'.format(len(text)))
tokenizer = Tokenizer(num_words=VOCAB_SIZE,
                      filters=filter_string,
                      lower=True,
                      split=" ",
                      char_level=False)
tokenizer.fit_on_texts(text)
word_to_int = {}
temp_word_to_int = tokenizer.word_index
for k, v in temp_word_to_int.items():
    if (v > 0 and v <= VOCAB_SIZE):
        word_to_int[k] = v

#for k, v in word_to_int.items():
#    print(k, v)
int_to_word = dict(zip(word_to_int.values(), word_to_int.keys()))
f = open('int_to_word_dict.txt', 'w')
simplejson.dump(int_to_word, f)
print('No of unique words: {}'.format(len(word_to_int)))
Example #13
essay_df = pd.read_csv('data.csv', encoding="latin-1")
all_descriptions = list(essay_df.desc.values)

#colnames = ['description']
#data = pd.read_csv('data.csv', names=colnames)
#all_description = data.description.tolist()

len(all_descriptions)

corpus = [x for x in all_descriptions]
corpus[:1]

t = Tokenizer(num_words=None,
              filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
              lower=True,
              split=' ',
              char_level=False,
              oov_token=None,
              document_count=0)
t.fit_on_texts(corpus)

# A dictionary of words and their counts.
print(t.word_counts)

# A dictionary of words and how many documents each appeared in.
print(t.word_docs)

# An integer count of the total number of documents used to fit the Tokenizer.
print(t.document_count)

# A dictionary of words and their uniquely assigned integers.
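print(t.word_index)  # assumed completion implied by the comment above; the original snippet is cut off here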
Example #14
    #df['%s_out' % i] = df[str(i)].str.pad(max_line_len, 'right', '\n') + ('\n' if i == 2 else df[str(i+1)].str[0])
    
    # TODO - trying to add the next line's first character before the line breaks
    if i == 2: # If it's the last line
        df['%s_out' % i] = df[str(i)].str.pad(max_line_length+2, 'right', '\n')
    else: 
        # If it's the first or second line, add the first character of the next line to the end of this line.
        # This helps with training so that the next RNN has a better chance of getting the first character right.
        df['%s_out' % i] = (df[str(i)] + '\n' + df[str(i+1)].str[0]).str.pad(max_line_length+2, 'right', '\n')
    
max_line_length += 2

inputs = df[['0_in', '1_in', '2_in']].values


tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
n_tokens = len(tokenizer.word_counts) + 1

print(df)

# X is the input for each line in sequences of one-hot-encoded values
X = np_utils.to_categorical([
  tokenizer.texts_to_sequences(inputs[:,i]) for i in range(3)
  ], num_classes=n_tokens)

outputs = df[['0_out', '1_out', '2_out']].values

# Y is the output for each line in sequences of one-hot-encoded values
Y = np_utils.to_categorical([
    tokenizer.texts_to_sequences(outputs[:,i]) for i in range(3)
Example #15
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
train_decoder_input = decoder_input[:int(n_rows*train_size)]
train_decoder_label = decoder_label[:int(n_rows*train_size)]

test_encoder_input = encoder_input[int(n_rows*train_size):]
test_decoder_input = decoder_input[int(n_rows*train_size):]
test_decoder_label = decoder_label[int(n_rows*train_size):]

print(train_encoder_input.shape)
print(train_decoder_input.shape)
print(train_decoder_label.shape)

print(test_encoder_input.shape)
print(test_decoder_input.shape)
print(test_decoder_label.shape)

q_tok = Tokenizer()
q_tok.fit_on_texts(train_encoder_input)
print(len(q_tok.word_counts))

a_tok = Tokenizer()
a_tok.fit_on_texts(train_decoder_input)
a_tok.fit_on_texts(train_decoder_label)
print(len(a_tok.word_counts))

train_encoder_input = q_tok.texts_to_sequences(train_encoder_input)
test_encoder_input = q_tok.texts_to_sequences(test_encoder_input)

train_decoder_input = a_tok.texts_to_sequences(train_decoder_input)
test_decoder_input = a_tok.texts_to_sequences(test_decoder_input)

train_decoder_label = a_tok.texts_to_sequences(train_decoder_label)
features = train['word_unique_vs_len'].fillna(0)
test_features = test['word_unique_vs_len'].fillna(0)
features = features.values.reshape(-1, 1)
test_features = test_features.values.reshape(-1, 1)
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)

df_y_train = (train["class"]-1).astype(int)
test_id = test[["id"]].copy()

print ('pre_processed done')

column = "word_seg"
tknzr_word = Tokenizer(num_words=config.len_desc)
tknzr_word.fit_on_texts(all_data[column].values)


# Prune words whose count falls below count_thres so texts_to_sequences will skip them.
low_count_words = [w for w, c in tknzr_word.word_counts.items() if c < count_thres]
for w in low_count_words:
    del tknzr_word.word_index[w]
    del tknzr_word.word_docs[w]
    del tknzr_word.word_counts[w]

tr_word_seq = tknzr_word.texts_to_sequences(train[column].values)
te_word_seq = tknzr_word.texts_to_sequences(test[column].values)

tr_word_pad = pad_sequences(tr_word_seq, maxlen=config.maxlen)
te_word_pad = pad_sequences(te_word_seq, maxlen=config.maxlen)
Example #18
totalen = len(test.question1)
print('Test size: ', totalen)

train = pd.read_csv("../../quora-data/train.csv")
totalen = len(train.question1)
print('Train size: ', totalen)

text_material = test.question1 + test.question2 + train.question1 + train.question2
# text_material = train.question2[:1000]

train_material = []
for line in text_material:
    line = get_words(line)
    train_material.append(line)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(train_material)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

sorted_word_freqs = sorted(tokenizer.word_counts.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_word_freqs[-500:])

word_set = set([])
for ky, vl in tokenizer.word_counts.items():
    word_set.add(ky)
print('Total word in count %d' % len(word_set))

fasttext_set = set([])
with open('/data2/tonyq/quora-data/crawl-300d-2M.vec') as f:
     if str(1) in Query_label[m]:
         Query_label[m] = 1
     elif str(0) in Query_label[m]:
         Query_label[m] = 0
 # y= keras.utils.to_categorical(Query_label, num_classes=2)
 y = Query_label
 Query = Query_a + Query_b
 Q = []
 seq_maxlen = []
 for i in Query:
     seq_maxlen.append(len(i))
     Q.append(" ".join(jieba.cut(i)))
     # Q.append(" ".join(list(i)))
 # maxlen=max(seq_maxlen)
 maxlen = 60
 token = Tokenizer(num_words=None,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
 token.fit_on_texts(Q)
 sequences = token.texts_to_sequences(Q)
 wordindex = token.word_index
 data = pad_sequences(sequences, maxlen=maxlen)
 # labels = keras.utils.to_categorical([int(i) for i in Query_label], num_classes=2)
 for index in wordindex:
     wordindex[index] = wordindex[index] + 1
 wordindex['PAD'] = 0
 wordindex['UNK'] = 1
 output = open('./dic/vocab.pkl', 'wb')
 pickle.dump(wordindex, output)
 output.close()
 model = gensim.models.Word2Vec.load('./model/Q_w2v_new.model')
 embedding_matrix = np.zeros((len(wordindex) + 1, 300))
 for word, i2 in wordindex.items():
max_len = 150

y = train['Category'].values
y = to_categorical(y)

y_test = test['Category'].values
y_test = to_categorical(y_test)
train["title"].fillna("no comment")
test["title"].fillna("no comment")
X_train, X_valid, Y_train, Y_valid = train_test_split(train, y, test_size=0.1)

raw_text_train = X_train["title"].str.lower()
raw_text_valid = X_valid["title"].str.lower()
raw_text_test = test["title"].str.lower()

tk = Tokenizer(num_words=max_features, lower=True)
tk.fit_on_texts(raw_text_train)
X_train["comment_seq"] = tk.texts_to_sequences(raw_text_train)
X_valid["comment_seq"] = tk.texts_to_sequences(raw_text_valid)
test["comment_seq"] = tk.texts_to_sequences(raw_text_test)

X_train = pad_sequences(X_train.comment_seq, maxlen=max_len)
X_valid = pad_sequences(X_valid.comment_seq, maxlen=max_len)
test = pad_sequences(test.comment_seq, maxlen=max_len)


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embedding_index = dict(
Example #21
            else:
                f = open(wordpath, encoding='gbk', errors='ignore')

            text = f.read()
            seg_list = jie_test.cut(text.strip().replace(' ', ''),
                                    cut_all=False)
            new_content = " ".join(seg_list)
            texts.append(new_content)
            f.close()
            labels.append(label_id)
for tes in texts:
    print("------>>")
    print(tes)
print('Found %s texts.' % len(texts))

tokenizer = Tokenizer(num_words=max_nb_words, filters="", oov_token="unk")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=max_sequence_length)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])

for la in labels:
def neural_extractor1(data, categories, aspects, text_to_predict):

    for i in range(0, len(data)):
        flag = 0
        temp_sent = []
        text = data[i]

        words = nltk.word_tokenize(text)
        pos = []
        for word in nltk.pos_tag(words):
            parts_of_speech[word[1]] = 1
            pos.append(word[1])

        tags = ['O' for ff in range(0, len(words))]
        for aspect in aspects[i]:
            asp_words = nltk.word_tokenize(aspect.lower())

            j = 0
            k = 0
            # flag=0
            while (k < len(asp_words)):
                while (j < len(words)):
                    if (asp_words[k] == words[j] and tags[j] == 'O'):

                        if (k == 0):
                            tags[j] = 'B'
                        else:
                            tags[j] = 'I'
                        # if(flag==0):
                        # 	tags[j]='B'
                        # 	flag=1
                        # else:
                        # 	tags[j]='I'
                        k += 1
                        if (k >= len(asp_words)):
                            break
                    j += 1
                k += 1

        for ii in range(0, len(words)):
            temp_sent.append((words[ii], pos[ii], tags[ii]))
        sentences.append(temp_sent)
    print(len(sentences))

    for i in range(0, len(data)):
        tokens = nltk.word_tokenize(data[i])
        string = ' '.join(tokens)
        data[i] = string
    #data.append(' '.join(words_to_predict))
    #lll=len(data)-1
    data.append("ENDPAD")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)
    word_index = tokenizer.word_index

    X = pad_sequences(sequences[:-1],
                      maxlen=50,
                      padding="post",
                      value=word_index["endpad"])

    validation_size = int(0.2 * X.shape[0])
    #print(X_to_predict)

    n_words = len(word_index)

    tag_list = ['B', 'I', 'O', 'P']
    n_tags = len(tag_list)

    embedding_matrix = np.zeros((n_words, 300))

    for word, i in word_index.items():
        if (i >= len(word_index)):
            continue
        if word in glove_emb:
            embedding_matrix[i] = glove_emb[word]

    max_len = 50
    tag2idx = {t: i for i, t in enumerate(tag_list)}
    idx2word = {t: i for i, t in word_index.items()}
    pos2idx = {t: i for i, t in enumerate(parts_of_speech.keys())}

    y = [[tag2idx[w[2]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len,
                      sequences=y,
                      padding="post",
                      value=tag2idx["P"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]

    pos = [[pos2idx[w[1]] for w in s] for s in sentences]
    pos1 = pad_sequences(maxlen=max_len,
                         sequences=pos,
                         padding="post",
                         value=len(parts_of_speech.keys()) + 1)

    pos = np.asarray([np.reshape(i, (max_len, 1)) for i in pos1])

    # indices=np.arange(X.shape[0])
    # np.random.shuffle(indices)
    # X=X[indices]
    # y=y[indices]
    #validation_size=int(0.2*X.shape[0])

    X_tr = X[:-validation_size]
    tr_pos = pos[:-validation_size]
    y_tr = y[:-validation_size]
    X_te = X[-validation_size:]
    te_pos = pos[-validation_size:]
    y_te = y[-validation_size:]
    X_to_predict = X[-validation_size:]

    pos_to_predict = pos[-validation_size:]
    # X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

    vocab_size = len(word_index)
    e = Input(shape=(max_len, ))
    emb = Embedding(vocab_size,
                    300,
                    weights=[embedding_matrix],
                    input_length=50,
                    mask_zero=True,
                    trainable=False)(e)
    ad_pos = Input(shape=(max_len, 1))
    co_tm = Concatenate()([emb] + [ad_pos])
    bi_gru = Bidirectional(GRU(50, return_sequences=True))(emb)
    out = Dense(25, activation='relu')(bi_gru)
    # out=Dropout(0.1)(out)
    out = TimeDistributed(Dense(n_tags, activation='softmax'))(out)
    model = Model(inputs=[e, ad_pos], outputs=[out])
    model.compile(loss='categorical_crossentropy',
                  optimizer="rmsprop",
                  metrics=['accuracy'])

    model.fit([X, pos],
              np.array(y),
              batch_size=25,
              epochs=20,
              validation_data=([X_te, te_pos], np.array(y_te)),
              verbose=1)
    pos_tp = np.asarray([np.asarray([1 for i in range(0, 50)]).reshape(50, 1)])
    #model=load_model('aspect_extractor.h5')
    #with open('aspect.json') as ff:
    #	model_json=ff.read()
    #	model=keras.models.model_from_json(model_json)
    #model.compile(loss='categorical_crossentropy',optimizer="rmsprop",metrics=['accuracy'])
    #model.load_weights('aspect_weights.h5')
    #model.fit([X], np.array(y), batch_size=25, epochs=15, validation_data=([X_te],np.array(y_te)), verbose=0)
    #print(X_to_predict,X_to_predict.shape)
    p1 = model.predict([X_to_predict, pos_to_predict])
    #p1=model.predict([X_to_predict])
    #print(p1)
    pred_aspects = []
    for i in range(0, len(p1)):
        p = np.argmax(p1[i], axis=-1)
        temp1 = []
        flag = 0
        string1 = ""
        for j in range(0, len(p)):
            #print(idx2word[X_to_predict[i][j]],tag_list[p[j]])
            if (idx2word[X_to_predict[i][j]] == "endpad"):
                break
            if (tag_list[p[j]] == 'B'):
                string1 += idx2word[X_to_predict[i][j]] + " "
                if (flag == 0):
                    flag = 1
            elif (tag_list[p[j]] == 'I'):
                string1 += idx2word[X_to_predict[i][j]] + " "
            elif (tag_list[p[j]] == 'O'):
                if (string1 != ""):
                    temp1.append(string1)
                string1 = ""
                flag = 0
        pred_aspects.append(temp1)

    #print(pred_aspects)
    return pred_aspects

    # print(aspects[:-validation_size][69])

    # for i in range(0,20):
    # 	print(aspects[i],pred_aspects[i])

    # p=np.argmax(p,axis=-1)
    # true_p=np.argmax(y_tr[69],axis=-1)

    # for i in range(0,len(p)):
    # 	print(true_p[i],p[i])

    #for w, pred in zip(X_to_predict[0], p1[0]):
    #	print(idx2word[w], tag_list[pred])
from keras.utils.np_utils import to_categorical
from keras.layers.merge import concatenate
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Activation, Input
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D
from keras.layers import  BatchNormalization
from sklearn.model_selection import train_test_split
from data_helper_3c import load_data_and_labels
import matplotlib.pyplot as plt

good_data_file = "./data/good_cut_jieba.txt"
bad_data_file = "./data/bad_cut_jieba.txt"
mid_data_file = "./data/mid_cut_jieba.txt"
x_text, y = load_data_and_labels(good_data_file, bad_data_file, mid_data_file)

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer.fit_on_texts(x_text)
vocab = tokenizer.word_index

x_train, x_test, y_train, y_test = train_test_split(x_text, y, test_size=0.2, random_state=2017)

x_train_word_ids = tokenizer.texts_to_sequences(x_train)
x_test_word_ids = tokenizer.texts_to_sequences(x_test)
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=64)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=64)

#TextInception
main_input = Input(shape=(64,), dtype='float64')
embedder = Embedding(len(vocab) + 1, 256, input_length = 64)
embed = embedder(main_input)
    y_train[i][x] = 1
    i = i + 1

i = 0
y_test = np.zeros((len(label_test), max(label_test) + 1))
for x in label_test:
    y_test[i][x] = 1
    i = i + 1

i = 0
y_valid = np.zeros((len(label_valid), max(label_valid) + 1))
for x in label_valid:
    y_valid[i][x] = 1
    i = i + 1

t = Tokenizer()
t.fit_on_texts(input_train)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(input_train)
#print(encoded_docs)
# pad documents to a max length of 4 words
max_length = max(len_finder)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
#print(padded_docs)
# load the whole embedding into memory
embeddings_index = dict()
f = open("G:\\NLP\\Dataset\\GloVe\\glove.6B.100d.txt", encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
Example #25
def main():
    parser = argparse.ArgumentParser(description="Baseline Script for SemEval")
    parser.add_argument('-config', help='Config to read details', required=True)
    args = parser.parse_args()

    with open(args.config) as configfile:
        config = json.load(configfile)
        
    global trainDataPath, testDataPath, gloveDir
    global NUM_FOLDS, NUM_CLASSES, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM
    global BATCH_SIZE, LSTM_DIM, DROPOUT, NUM_EPOCHS, LEARNING_RATE, PREPROCESS     
    
    trainDataPath = config["train_data_path"]
    testDataPath = config["test_data_path"]
    gloveDir = config["glove_dir"]
    
    NUM_FOLDS = config["num_folds"]
    NUM_CLASSES = config["num_classes"]
    MAX_NB_WORDS = config["max_nb_words"]
    MAX_SEQUENCE_LENGTH = config["max_sequence_length"]
    EMBEDDING_DIM = config["embedding_dim"]
    BATCH_SIZE = config["batch_size"]
    LSTM_DIM = config["lstm_dim"]
    DROPOUT = config["dropout"]
    LEARNING_RATE = config["learning_rate"]
    NUM_EPOCHS = config["num_epochs"]
    PREPROCESS = bool(config["preprocess"])
        
    print("Processing training data...")
    trainIndices, trainTexts, labels = preprocessData(trainDataPath, mode="train", preprocess=PREPROCESS)
    # Write normalised text to file to check if normalisation works. Disabled now. Uncomment following line to enable   
    # writeNormalisedData(trainDataPath, trainTexts)
    print("Processing test data...")
    testIndices, testTexts, testLabels = preprocessData(testDataPath, mode="train", preprocess=PREPROCESS)
    # writeNormalisedData(testDataPath, testTexts)

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(trainTexts)
    trainSequences = tokenizer.texts_to_sequences(trainTexts)
    testSequences = tokenizer.texts_to_sequences(testTexts)

    wordIndex = tokenizer.word_index
    print("Found %s unique tokens." % len(wordIndex))

    print("Populating embedding matrix...")
    embeddingMatrix = getEmbeddingMatrix(wordIndex)

    data = pad_sequences(trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(labels))
    testLabels = to_categorical(np.asarray(testLabels))
    print("Shape of training data tensor: ", data.shape)
    print("Shape of label tensor: ", labels.shape)
        
    # Randomize data
    np.random.shuffle(trainIndices)
    data = data[trainIndices]
    labels = labels[trainIndices]
      
    # Perform k-fold cross validation
    metrics = {"accuracy" : [],
               "microPrecision" : [],
               "microRecall" : [],
               "microF1" : []}
    
    print("Starting k-fold cross validation...")
    for k in range(NUM_FOLDS):
        print('-'*40)
        print("Fold %d/%d" % (k+1, NUM_FOLDS))
        validationSize = int(len(data)/NUM_FOLDS)
        index1 = validationSize * k
        index2 = validationSize * (k+1)
            
        xTrain = np.vstack((data[:index1],data[index2:]))
        yTrain = np.vstack((labels[:index1],labels[index2:]))
        xVal = data[index1:index2]
        yVal = labels[index1:index2]
        print("Building model...")
        model = buildModel(embeddingMatrix)
        model.fit(xTrain, yTrain, 
                  validation_data=(xVal, yVal),
                  epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)

        predictions = model.predict(xVal, batch_size=BATCH_SIZE)
        accuracy, microPrecision, microRecall, microF1 = getMetrics(predictions, yVal)
        metrics["accuracy"].append(accuracy)
        metrics["microPrecision"].append(microPrecision)
        metrics["microRecall"].append(microRecall)
        metrics["microF1"].append(microF1)
        
    print("\n============= Metrics =================")
    print("Average Cross-Validation Accuracy : %.4f" % (sum(metrics["accuracy"])/len(metrics["accuracy"])))
    print("Average Cross-Validation Micro Precision : %.4f" % (sum(metrics["microPrecision"])/len(metrics["microPrecision"])))
    print("Average Cross-Validation Micro Recall : %.4f" % (sum(metrics["microRecall"])/len(metrics["microRecall"])))
    print("Average Cross-Validation Micro F1 : %.4f" % (sum(metrics["microF1"])/len(metrics["microF1"])))
    
    print("\n======================================")
    
    print("Retraining model on entire data to create solution file")
    model = buildModel(embeddingMatrix)
    model.fit(data, labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
    model.save('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))
    # model = load_model('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE))

    print("Creating solution file...")
    testData = pad_sequences(testSequences, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(testData, batch_size=BATCH_SIZE)

    print("Results on test data...")
    import utils
    utils.getMetrics(predictions, testLabels)
    
    print("Completed. Model parameters: ")
    print("Learning rate : %.3f, LSTM Dim : %d, Dropout : %.3f, Batch_size : %d" 
          % (LEARNING_RATE, LSTM_DIM, DROPOUT, BATCH_SIZE))
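
For reference, the config file read at the top of main() is a JSON object whose keys match the lookups above; the values below are placeholders, not the original settings.

import json

sample_config = {
    "train_data_path": "train.txt",        # placeholder paths
    "test_data_path": "test.txt",
    "glove_dir": "./glove",
    "num_folds": 5, "num_classes": 4,
    "max_nb_words": 20000, "max_sequence_length": 100, "embedding_dim": 100,
    "batch_size": 200, "lstm_dim": 128, "dropout": 0.2,
    "learning_rate": 0.003, "num_epochs": 10, "preprocess": 1,
}
with open("baseline.config", "w") as f:    # assumed file name
    json.dump(sample_config, f, indent=2)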
Example #26
len(embeddings_index) #15,523 words

# load .bin 
#from gensim.models import KeyedVectors
#word_vectors = KeyedVectors.load_word2vec_format(os.path.join(project_root, "class_nn/archive_corpus_w2v_model.bin"), binary=True)

# The model takes a group of sentences per class and converts each entry into a
# tokenized integer vector; these vectors vary in length from entry to entry.
#sentences = df['stem_text'].values # include stopwords, stemmed
sentences = df['clean_text'] # include stopwords, unstemmed
y = df['just_categories']
df['just_categories'].values

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

word_index = tokenizer.word_index # word and their token # ordered by most frequent
print('Found %s unique tokens.' % len(word_index))

max_words = 5620 # total words of vocabulary we will consider

num_words = [len(words.split()) for words in sentences]
max_seq_len = max(num_words) + 1

# More than one way of doing this
# another way
import string
from nltk.tokenize import word_tokenize
Example #27
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np 

tokenizer = Tokenizer()

def dataset_preparation(data):

	# basic cleanup
	corpus = data.lower().split("\n")

	# tokenization	
	tokenizer.fit_on_texts(corpus)
	total_words = len(tokenizer.word_index) + 1

	# create input sequences using list of tokens
	input_sequences = []
	for line in corpus:
		token_list = tokenizer.texts_to_sequences([line])[0]
		for i in range(1, len(token_list)):
			n_gram_sequence = token_list[:i+1]
			input_sequences.append(n_gram_sequence)

	# pad sequences 
	max_sequence_len = max([len(x) for x in input_sequences])
	input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
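
	# Hypothetical continuation (the original function is cut off here): split each
	# padded n-gram into predictors (all tokens but the last) and a one-hot label
	# (the last token), the usual setup for next-word prediction.
	predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
	label = ku.to_categorical(label, num_classes=total_words)
	return predictors, label, max_sequence_len
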
from keras.callbacks.callbacks import EarlyStopping
from keras.models import load_model, Sequential
from keras.layers import Dense, LSTM
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import numpy as np
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC

ratings = []
features = []
vectorizer = CountVectorizer()
t = Tokenizer()

def is_number(some_string):
    try: 
        int(some_string)
        return True
    except ValueError:
        return False

def read_ratings_information(ratings_file):
    with open(ratings_file, 'r', encoding='utf-8') as fs:
        lines = fs.readlines()
    for line in lines:
        # first capture diagnostic code
        parts = line.strip().split('   ')
        diagnostic_code = parts[0]
Example #29
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))


# Format text samples and labels into tensors 

os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
Example #30
model._make_predict_function()
words = pd.read_csv("scores_latest.csv")
jsfile=open("data.json","r")
data = json.loads(jsfile.read())
data = {float(k):v for k,v in data.items()}
print('initial:', data)

pos = 0.2
neg = 0.2
c=0
# put the path sidarth
data1 = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data1 = data1[['text', 'sentiment']]

tokenizer = Tokenizer(num_words=8000, split=' ')
tokenizer.fit_on_texts(data1['text'].values)

'''for j in range(len(words)):
    i = words["words"][j]
    i = i.lower()
    score = words["sentiment"][j]
    if score > 0.8:
        data[0.8][i] = 0
    elif score > 0.6:
        data[0.6][i] = 0
    elif score > 0.4:
        data[0.4][i] = 0
    elif score > 0.2:
        data[0.2][i] = 0
    elif score > 0.0: