Example #1
def text_encode(train, val, test, type='onehot', maxlen=20):
    label_dict = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    train_label = train['CATEGORY']
    val_label = val['CATEGORY']
    test_label = test['CATEGORY']
    for (key, value) in label_dict.items():
        train_label = train_label.replace(key, value)
        val_label = val_label.replace(key, value)
        test_label = test_label.replace(key, value)
    train_label = to_categorical(train_label, num_classes=4)
    val_label = to_categorical(val_label, num_classes=4)
    test_label = to_categorical(test_label, num_classes=4)

    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=" ")
    tokenizer.fit_on_texts(train['TITLE'])
    word_index = tokenizer.word_index
    vocab = tokenizer.word_index
    train_id = tokenizer.texts_to_sequences(train['TITLE'])
    val_id = tokenizer.texts_to_sequences(val['TITLE'])
    test_id = tokenizer.texts_to_sequences(test['TITLE'])
    if type == 'seq':
        train_id = pad_sequences(train_id, padding='post', maxlen=maxlen)
        val_id = pad_sequences(val_id, padding='post', maxlen=maxlen)
        test_id = pad_sequences(test_id, padding='post', maxlen=maxlen)
        return train_id, train_label, val_id, val_label, test_id, test_label, vocab, word_index
    else:
        train_onehot = tokenizer.sequences_to_matrix(train_id, mode='binary')
        val_onehot = tokenizer.sequences_to_matrix(val_id, mode='binary')
        test_onehot = tokenizer.sequences_to_matrix(test_id, mode='binary')
        return train_onehot, train_label, val_onehot, val_label, test_onehot, test_label, vocab, word_index
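A minimal usage sketch for the function above, assuming pandas DataFrames with the TITLE and CATEGORY columns it expects (the toy frames below are hypothetical, not part of the original project):

import pandas as pd

toy = pd.DataFrame({'TITLE': ['stocks rally on strong earnings', 'new phone model released today'],
                    'CATEGORY': ['b', 't']})
# default type='onehot': binary bag-of-words features plus one-hot labels
X_tr, y_tr, X_va, y_va, X_te, y_te, vocab, word_index = text_encode(toy, toy, toy)
# type='seq': padded integer sequences, e.g. for an Embedding + RNN model
S_tr, y_tr, S_va, y_va, S_te, y_te, vocab, word_index = text_encode(toy, toy, toy, type='seq', maxlen=20)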
Example #2
def train_population(population):
    # Initialize the data set
    (X_train, y_train), (X_test,
                         y_test) = reuters.load_data(num_words=max_words)
    num_classes = np.max(y_train) + 1
    tokenizer = Tokenizer(num_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # For graph purposes
    histories = []

    for neural_network in population:
        # Create the model
        keras_model = create_keras_model(neural_network, num_classes)
        print(neural_network)
        # Train it
        history = keras_model.fit(X_train,
                                  y_train,
                                  batch_size=128,
                                  epochs=20,
                                  verbose=2,
                                  validation_data=(X_test, y_test))
        # Score it
        score = keras_model.evaluate(X_test, y_test, verbose=0)
        if neural_network["accuracy"] == 0.:
            neural_network["accuracy"] = score[1]
        # Save it
        histories.append(history)

    return histories
Example #3
def mlp_model(X_train, y_train, X_test, y_test):
    tokenizer = Tokenizer(nb_words=1000)
    nb_classes = np.max(y_train) + 1

    X_train = tokenizer.sequences_to_matrix(X_train, mode="freq")
    X_test = tokenizer.sequences_to_matrix(X_test, mode="freq")

    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    print("Building model...")
    model = Sequential()
    model.add(Dense(512, input_shape=(max_len,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode='categorical')

    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
    # print('Test score:', score[0])
    # print('Test accuracy:', score[1])
    pred_labels = model.predict_classes(X_test)
    # print pred_labels
    # print y_test
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print(precision, recall, f1, supp)

    return accuracy, precision, recall, f1
Example #4
def running_retuter(modelname):
    maxlen = 400
    max_words = 10000

    # 1. Loading started
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)

    word_index = reuters.get_word_index(path="reuters_word_index.json")
    num_classes = np.max(y_train) + 1

    # 2. pad_sequences
    keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

    if (modelname == 'cnn'):
        x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
        x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

    elif(modelname == 'nn'):
        tokenizer = Tokenizer(num_words=max_words)
        x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
        x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

    bulidModel(modelname, num_classes, x_test, y_test, x_train, y_train)
Example #5
def train(model, x_train, y_train, x_test, y_test):
    num_classes = np.max(y_train) + 1

    tokenizer = Tokenizer(num_words=1000)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    modelcheckpoint_callback = ModelCheckpoint("./best_reuters_model.h5",
                                               monitor='val_loss',
                                               mode='min',
                                               save_best_only=True,
                                               save_weights_only=True)
    history = model.fit(x_train,
                        y_train,
                        batch_size=32,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[modelcheckpoint_callback])
    score = model.evaluate(x_test, y_test, batch_size=32, verbose=1)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
Example #6
def get_inputs(file_lst, split=0.2):
    all_tweets = []
    i = 0
    for file in file_lst:
        f = np.load(file)
        for t in f:
            all_tweets.append((t, i))
        i += 1
    shuffle(all_tweets)
    X_train, y_train = [], []
    X_test, y_test = [], []
    split_num = int(len(all_tweets) * split)
    for i in range(split_num):
        X_test.append(all_tweets[i][0])
        y_test.append(all_tweets[i][1])
    for i in range(split_num, len(all_tweets)):
        X_train.append(all_tweets[i][0])
        y_train.append(all_tweets[i][1])
    # tokenize data
    tokenizer = Tokenizer(num_words=5000)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    Y_train = np_utils.to_categorical(y_train, num_categories)
    Y_test = np_utils.to_categorical(y_test, num_categories)
    return (X_train, Y_train), (X_test, Y_test)
Example #7
    def _build_data(self):
        """
        data preprocessing & graph input initialization

        args:

            train_dataset: tuple -- (x_train, y_train)
            test_dataset: tuple -- (x_test, y_test)
        """
        # one-hot encode
        tokenizer = Tokenizer(num_words=1000)
        self._x_train = tokenizer.sequences_to_matrix(self._x_train,
                                                      mode="binary")
        self._y_train = keras.utils.to_categorical(self._y_train,
                                                   self.output_dim)
        self._x_test = tokenizer.sequences_to_matrix(self._x_test,
                                                     mode="binary")
        self._y_test = keras.utils.to_categorical(self._y_test,
                                                  self.output_dim)
        self.data_num = self._x_train.shape[0]

        with tf.name_scope("init"):
            self.x = tf.placeholder(tf.float32, shape=(None, 1000), name="x")
            self.y = tf.placeholder(tf.float32, shape=(None, 2), name="y")
            self.global_step = tf.get_variable("global_step",
                                               trainable=False,
                                               initializer=tf.constant(0))
Example #8
def preprocess_features(x_train, x_test, max_words):
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    return x_train, x_test
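A short usage sketch for this helper with the Reuters newswire data that most of these examples use (illustrative only, not from the source project):

from keras.datasets import reuters

max_words = 10000
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words)
x_train, x_test = preprocess_features(x_train, x_test, max_words)
# each row is now a 10000-dimensional binary bag-of-words vector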
Example #9
def main():

    tweets = [['Trump is crazy'],
              ['trump is bitching all the asdasda in live'],
              ['Soccer is too slow'], ['Waste time in World Cup rum booze']]
    train_y = np.array([1, 1, 0, 0])
    train_x = [x[0] for x in tweets]
    tokenizer = Tokenizer(num_words=max_words)
    print(train_x)
    tokenizer.fit_on_texts(train_x)
    dictionary = tokenizer.word_index
    print("dictionary: ", dictionary)

    def convert_text_to_index_array(text):
        # `text_to_word_sequence` lowercases, strips punctuation and splits the
        # text into word tokens; unseen words fall back to index 0 below.
        result = []
        for word in kpt.text_to_word_sequence(text):
            print("word: ", word)
            x = dictionary.get(word, 0)
            print("x: ", x)
            result.append(x)
        return result
        #return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

    allWordIndices = []
    for text in train_x:
        wordIndices = convert_text_to_index_array(text)
        allWordIndices.append(wordIndices)

    allWordIndices = np.asarray(allWordIndices)
    print("allWord 1: ", allWordIndices)
    train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    print("train_x", train_x)
    print("type x: ", type(train_x))
    print("type y: ", type(train_y))

    # Sciki Learn
    clf = svm.SVC()
    clf.fit(train_x, train_y)

    pred_tweet = [
        'Trump is live asdasda tu eres juan', 'Trump is asdasda illary',
        'Trump is slow Soccer asdasda'
    ]
    allWordIndices = []
    for text in pred_tweet:
        wordIndices = convert_text_to_index_array(text)
        allWordIndices.append(wordIndices)
    allWordIndices = np.asarray(allWordIndices)
    print("allWord 2: ", allWordIndices)
    pred_X = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    print("pred X: ", pred_X)
    P = clf.predict(pred_X)
    print("P: ", P)
Example #10
def prepare(maxlen, dataset_filename='./data/dataset.csv', use_bigram=False):
    # df = pd.read_csv('./data/dataset.csv')
    df = pd.read_csv(dataset_filename)
    X = df['NAME']
    y = df['NATIONALITY']
    num_classes = len(y.unique())

    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X, y, test_size=0.2, random_state=69)

    X_tokenizer = Tokenizer(num_words=None,
                            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                            lower=False,
                            char_level=True,
                            oov_token=None)

    y_tokenizer = Tokenizer(num_words=None,
                            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                            lower=True,
                            char_level=False,
                            oov_token=None)

    # astype(str) avoids "AttributeError: 'int' object has no attribute 'lower'" in fit_on_texts
    X_train = X_train_df.values.astype(str)
    X_test = X_test_df.values.astype(str)

    if use_bigram:
        X_train = bigrams(X_train)

    X_tokenizer.fit_on_texts(X_train)
    X_train = X_tokenizer.texts_to_sequences(X_train)
    X_test = X_tokenizer.texts_to_sequences(X_test)

    X_train = X_tokenizer.sequences_to_matrix(X_train, mode='tfidf')
    X_test = X_tokenizer.sequences_to_matrix(X_test, mode='tfidf')

    # encode from string labels to numerical labels
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(
        y_train_df.values.astype(str))  # error without astype(str)
    y_test = label_encoder.transform(y_test_df.values.astype(str))

    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    # pad character sequences to have the same length
    X_train = sequence.pad_sequences(X_train, padding="post", maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, padding="post", maxlen=maxlen)

    max_features = len(X_tokenizer.word_counts)

    return [X_train, y_train, X_test, y_test, max_features, num_classes]
Example #11
def tfidf_process_ci_feats_keras(data, train_data, test_data, num_feats):

    y = train_data['Score']
    tokenizer = Tokenizer(num_words=num_feats)
    tokenizer.fit_on_texts(data['cutted_Dis'])
    sequences = tokenizer.texts_to_sequences(train_data['cutted_Dis'])
    X = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
    sequences1 = tokenizer.texts_to_sequences(test_data['cutted_Dis'])
    test_hh = tokenizer.sequences_to_matrix(sequences1, mode='tfidf')
    print(X.shape)
    return X, test_hh, y
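A hedged usage sketch; the toy DataFrame is hypothetical but has the cutted_Dis (segmented text) and Score columns the function expects:

import pandas as pd

df = pd.DataFrame({'cutted_Dis': ['good service nice room', 'noisy room bad smell'],
                   'Score': [5, 1]})
X, test_hh, y = tfidf_process_ci_feats_keras(df, df, df, num_feats=10)
# X and test_hh are tf-idf document-term matrices built from the same fitted vocabulary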
Example #12
def main():
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                             test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    print('# of Training Samples: {}'.format(len(x_train)))
    print('# of Test Samples: {}'.format(len(x_test)))

    num_classes = max(y_train) + 1
    print('# of Classes: {0}'.format(num_classes))

    max_words = 10000

    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='count')

    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    print(x_train[0])
    print(len(x_train[0]))
    print(max(x_train[0]))

    print(y_train[0])
    print(len(y_train[0]))

    model = Sequential()
    model.add(Dense(512, input_shape=(max_words, )))
    # model.add(Activation('relu'))
    model.add(Activation('exponential'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.metrics_names)

    batch_size = 32
    epochs = 2

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
Example #13
def NNclassify(X_train,X_test,y_train,y_test,inputtype):
	classtype="gender"
	max_words=10000
	batch_size=32
	nb_epoch=20
	if inputtype=='categorical':
		nb_epoch=10
		classtype="age"

	print('Loading data...')
	print(len(X_train), 'train instances')
	print(len(X_test), 'test instances')

	nb_classes = np.max(y_train)+1
	print(nb_classes, 'classes')

	print('Vectorizing sequence data...')
	tokenizer = Tokenizer(nb_words=max_words)
	X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
	X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)

	print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
	Y_train = np_utils.to_categorical(y_train, nb_classes)
	Y_test = np_utils.to_categorical(y_test, nb_classes)
	print('Y_train shape:', Y_train.shape)
	print('Y_test shape:', Y_test.shape)

	print('Building model...')
	model = Sequential()

	model.add(MaxoutDense(100, input_shape=(max_words,)))
	model.add(Dropout(0.7))
	model.add(Dense(nb_classes,init='uniform'))
	model.add(Activation('softmax'))

	model.compile(loss='categorical_crossentropy', optimizer='adam',class_mode=inputtype)
	history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
	score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
	print('Test score:', score[0])
	print('Test accuracy:', score[1])

	prediction=model.predict(X_test, batch_size=batch_size, verbose=1)
	pred_classes = np.argmax(prediction, axis=1)
	print(Counter(pred_classes))

	results=open('results.txt', 'a')
	results.write("{} \t {} features \t {} epochs \t {} batch size \t {} accuracy \n".format(classtype, max_words, nb_epoch, batch_size,score[1]))
	results.close()

	return pred_classes
Example #14
def get_data(mode='one_hot'):
    """从指定文件中获得待训练数据,数据源文件是txt文件以', '分割
    PARA:
    filename:数据源文件
    mode:返回值的类型,有one_hot与sequence两种
    RETURN:
    分割好的训练集、测验集
    """
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import pandas as pd
    import numpy as np
    import json
    print("getting data......")
    columns = ['content', 'label']
    content, label = [], []
    with open(
            'D:/instruments_generate/biLstmWithAttention/data/traffic/train.json',
            mode='r',
            encoding='utf8') as fp:
        for line in fp.readlines():
            try:
                data_dict = json.loads(line)
                content.append(data_dict['charge'] + data_dict['defense'] +
                               data_dict['support'])
                label.append(seq2lab(data_dict['result']))
            except:
                pass
    label = to_categorical(np.array(label))
    MAX_LEN = 500
    train_data, test_data, train_label, test_label = train_test_split(
        content, label, test_size=0.1, random_state=42)
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=" ")
    tokenizer.fit_on_texts(content)
    vocab = tokenizer.word_index

    train_data_ids = tokenizer.texts_to_sequences(train_data)
    test_data_ids = tokenizer.texts_to_sequences(test_data)
    if mode == 'one_hot':
        train_data = tokenizer.sequences_to_matrix(train_data_ids,
                                                   mode='binary')
        test_data = tokenizer.sequences_to_matrix(test_data_ids, mode='binary')
    elif mode == 'sequence':
        train_data = pad_sequences(train_data_ids, maxlen=MAX_LEN)
        test_data = pad_sequences(test_data_ids, maxlen=MAX_LEN)
    print("data getted")
    return train_data, test_data, train_label, test_label, vocab
Example #15
def bag_of_words():
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    num_classes = np.max(y_train) + 1
    print(num_classes, 'classes')
    max_words = 1000
    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    train = tokenizer.sequences_to_matrix(x_train, mode='count')
    test = tokenizer.sequences_to_matrix(x_test, mode='count')
    print('x_train shape:', train.shape)
    print('x_test shape:', test.shape)
    classify(train, y_train, test, y_test)
Example #16
def quick_dtmize(train_text, test_text, vocab_limit, mode='count'):
    '''Vectorize docs with the Keras Tokenizer API properly in one function call.'''
    assert mode in ['binary', 'count', 'freq',
                    'tfidf'], 'supplied `mode` invalid!'
    tokenizer = Tokenizer(num_words=vocab_limit)
    tokenizer.fit_on_texts(train_text)

    train_intseqs = tokenizer.texts_to_sequences(train_text)
    test_intseqs = tokenizer.texts_to_sequences(test_text)

    train_x = tokenizer.sequences_to_matrix(train_intseqs, mode=mode)
    test_x = tokenizer.sequences_to_matrix(test_intseqs, mode=mode)

    return train_x, test_x, tokenizer.word_index
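A usage sketch (toy texts; the vocab_limit value is an assumption):

train_text = ['the cat sat on the mat', 'dogs and cats living together']
test_text = ['the dog sat on the cat']
train_x, test_x, word_index = quick_dtmize(train_text, test_text, vocab_limit=50, mode='tfidf')
# train_x and test_x share the vocabulary fitted on train_text only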
Example #17
def load_data(em0, em1, em2, em3, em4, em5, em6):
    # em0,em1,em2,em3,em4,em5,em6=Domain()
    # words=createVocablulary(em0+em1+em2+em3+em4+em5+em6)
    test = em0[:72] + em1[:72] + em2[:72] + em3[:72] + em4[:72] + em5[:72] + em6[:72]  # the first 72 of each class serve as test samples
    val = em0[-30:] + em1[-30:] + em2[-30:] + em3[-30:] + em4[-30:] + em5[-30:] + em6[-30:]  # validation set: the last 30 of each class
    em2_new = []
    for i in range(360 - len(em2)):
        em2.append(random.choice(em2[72:-30]))  # re-sampling to pad em2 out to 360

    train=random.sample(em0[72:-30],258)+random.sample(em1[72:-30],258)+random.sample(em2[72:-30],258)+\
          random.sample(em3[72:-30],258)+random.sample(em4[72:-30],258)+random.sample(em5[72:-30],258)+\
          random.sample(em6[72:-30],258)
    words = createVocablulary(train + val)
    train_vec, train_label = createIndex(words, train)
    val_vec, val_label = createIndex(words, val)
    test_vec, test_label = createIndex(words, test)
    X_train = process(train_vec, nb_words=max_features)
    X_val = process(val_vec, nb_words=max_features)
    X_test = process(test_vec, nb_words=max_features)
    # X_train=np.array(X_train)
    # X_val=np.array(X_val)
    # X_test=np.array(X_test)
    print 'X_train:', len(X_train)
    print len(X_train[0])
    print X_train[0]
    # print X_train[0]
    # print len(X_test)
    tokenizer = Tokenizer(nb_words=max_features)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    print len(X_train[0])
    print X_train[0][:200]
    print X_train.shape
    X_val = tokenizer.sequences_to_matrix(X_val, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    X_train = np.reshape(X_train, (258 * 7, 1, max_features))
    X_val = np.reshape(X_val, (30 * 7, 1, max_features))
    X_test = np.reshape(X_test, (72 * 7, 1, max_features))
    Y_train = np_utils.to_categorical(train_label, nb_classes)
    Y_test = np_utils.to_categorical(test_label, nb_classes)
    Y_val = np_utils.to_categorical(val_label, nb_classes)
    return X_train, X_test, Y_train, Y_test, X_val, Y_val, test_label
Example #18
def TokenTestGen(parentpath, filename, encoding='gbk'):
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    dataGen = csvToTextGen(parentpath=parentpath,
                           filename=filename,
                           encoding=encoding)
    labelList, maxSegLen = csvToLabelAndDataMaxLen(parentpath=parentpath,
                                                   filename=filename,
                                                   encoding=encoding)

    # tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataGen)
    dataGenList = csvToTextGen(parentpath=parentpath,
                               filename=filename,
                               encoding=encoding)
    sequences = tokenizer.texts_to_sequences(dataGenList)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    if modelname == 'mlp':
        data = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
    else:
        data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(labelList, num_classes=LABEL_CLASS)

    # print("data:",data)
    # print("labels:",labels)

    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    return maxSegLen, word_index, labels, data
Example #19
def test_model(model, sent):
    K.clear_session()
    script_dir = os.path.dirname(os.path.realpath('__file__'))
    fileh = open(
        os.path.join(script_dir, 'data/nlp_models/{}_vocab.obj'.format(model)),
        'rb')
    lst_orth, lst_orth_dict, lst_labels, lst_labels_dict, lst_zero_label, lst_labels_dhae2 = pickle.load(
        fileh)
    model = load_model(
        os.path.join(script_dir, 'data/nlp_models/{}.h5'.format(model)))
    result = []
    txt = nlp(sent)
    tokens_lst = []
    for ent in txt.ents:
        print(ent)
        tokens, lemmas, pos_tags, shapes = extract_verbs_from_entity(
            ent, lst_orth, lst_orth_dict, add=False)
        if len(tokens) > 0:
            tokens_lst.append(tokens)
    x_matrix2 = np.array(tokens_lst)
    print(x_matrix2)
    tokenizer = Tokenizer(num_words=len(lst_orth))
    x_matrix3 = tokenizer.sequences_to_matrix(tokens_lst, mode='binary')
    zz = model.predict(x_matrix3, batch_size=32, verbose=1)
    for idx1, z in enumerate(zz):
        for idx, x in enumerate(zz[idx1]):
            v_id = '-'
            for k in lst_labels_dict.keys():
                if lst_labels_dict[k] == idx:
                    v_id = VocabsBaseClass.objects.get(id=k).name
            result.append((str(txt.ents[idx1]), idx, v_id, x))
    return result
Example #20
def get_data(
        filename='D:/judgement_prediction/judgement_prediction/temp/data.txt',
        mode='one_hot'):
    """从指定文件中获得待训练数据,数据源文件是txt文件以', '分割
    PARA:
    filename:数据源文件
    mode:返回值的类型,有one_hot与sequence两种
    RETURN:
    分割好的训练集、测验集
    """
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import pandas as pd
    import numpy as np
    print("getting data......")
    columns = ['content', 'label']
    data = pd.read_csv(filename,
                       encoding='utf-8',
                       sep=', ',
                       header=None,
                       names=columns,
                       engine='python')
    data.reindex(np.random.permutation(data.index))
    content = data['content']
    label = to_categorical(np.array(data['label']))
    MAX_LEN = 200
    train_data, test_data, train_label, test_label = train_test_split(
        content, label, test_size=0.1, random_state=42)
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=" ")
    tokenizer.fit_on_texts(content)
    vocab = tokenizer.word_index

    train_data_ids = tokenizer.texts_to_sequences(train_data)
    test_data_ids = tokenizer.texts_to_sequences(test_data)
    if mode == 'one_hot':
        train_data = tokenizer.sequences_to_matrix(train_data_ids,
                                                   mode='binary')
        test_data = tokenizer.sequences_to_matrix(test_data_ids, mode='binary')
    elif mode == 'sequence':
        train_data = pad_sequences(train_data_ids, maxlen=MAX_LEN)
        test_data = pad_sequences(test_data_ids, maxlen=MAX_LEN)
    print("data getted")
    return train_data, test_data, train_label, test_label, vocab
Example #21
def preproc_for_sklearn(X, y, nb_features):
    try:
        tokenizer = Tokenizer(num_words=nb_features)
    except TypeError:
        # older Keras versions expect the nb_words keyword instead of num_words
        tokenizer = Tokenizer(nb_words=nb_features)
    X = tokenizer.sequences_to_matrix(X, mode='binary')

    return X, y
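The binary matrix returned above can be fed directly to a scikit-learn estimator; a sketch assuming the Reuters data and a LogisticRegression classifier (neither is prescribed by the original snippet):

from keras.datasets import reuters
from sklearn.linear_model import LogisticRegression

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=1000)
X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features=1000)
X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features=1000)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))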
Example #23
def load_vect_mat():
    """The main method for loading, vectorizing, matrix-forming the newswire (labeled train & test) data
	to be fed into the Keras functional model API .fit and .evaluate functions.

	Arguments
	---------
	none

	Returns
	-------
	ttPair -- The usual pair of (X, Y)-train and that for test  (tuple/pair of tuples/pairs)
	"""

    print('\nLoading data...')
    (X_train, y_train), (X_test,
                         y_test) = reuters.load_data(nb_words=max_words,
                                                     test_split=0.2)
    print(len(X_train), 'train sequences  Be like:')
    print(X_train[0])
    print(len(X_test), 'test sequences  Be like:')
    print(X_test[0])

    global nb_classes
    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'topic classes')

    print('\nVectorizing (1/0) sequence data...')
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print(
        '\nConvert the list of (integer) class labels to one hotshot! -- 1/0 "row-wise" topic matrix (for use with categorical_crossentropy)'
    )
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    print(y_train[0], ' --> ', Y_train[0])
    print('... --> ...')
    print(y_train[-1], ' --> ', Y_train[-1])
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    ttPair = ((X_train, Y_train), (X_test, Y_test))
    return ttPair
Example #24
def runExperiment(xTrain, yTrain, xTest, yTest, outFile):
    numClasses = np.max(yTrain) + 1
    tokenizer = Tokenizer(num_words=MAXWORDS)
    xTrain = tokenizer.sequences_to_matrix(xTrain, mode='binary')
    xTest = tokenizer.sequences_to_matrix(xTest, mode='binary')
    yTrain = keras.utils.to_categorical(yTrain, numClasses)
    yTest = keras.utils.to_categorical(yTest, numClasses)
    model = Sequential()
    model.add(Dense(HIDDENLAYER1, input_shape=(MAXWORDS, )))
    model.add(Activation('relu'))
    model.add(Dense(HIDDENLAYER2))
    model.add(Activation('sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(numClasses))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(xTrain,
                        yTrain,
                        batch_size=BATCHSIZE,
                        epochs=EPOCHS,
                        verbose=VERBOSE,
                        validation_split=VALIDATIONSPLIT)
    predictions = model.predict(xTest, batch_size=BATCHSIZE, verbose=VERBOSE)
    labelsN = []
    predictionsN = []
    for i in range(0, len(predictions)):
        maxJ = -1
        maxP = 0
        for j in range(0, len(predictions[i])):
            if predictions[i][j] > maxP:
                maxP = predictions[i][j]
                maxJ = j
        maxYJ = -1
        maxY = 0
        for j in range(0, len(yTest[i])):
            if yTest[i][j] > maxY:
                maxY = yTest[i][j]
                maxYJ = j
        labelsN.append(maxYJ)
        predictionsN.append(maxJ)
        print(maxYJ, maxJ, file=outFile)
    score = metrics.accuracy_score(labelsN, predictionsN)
    return (score, labelsN, predictionsN)
Example #25
def train_model():
    max_words = 500
    data = pd.read_csv("data.csv", sep='\t', skipinitialspace=True)
    train_x = [x[1] for x in data.values[:1000]]
    # index all the sentiment labels
    train_y = np.asarray([x[0] for x in data.values[:1000]])

    tokenizer = Tokenizer(num_words=max_words)
    # feed tweets to the Tokenizer
    tokenizer.fit_on_texts(train_x)

    # Tokenizers come with a convenient list of words and IDs
    dictionary = tokenizer.word_index

    # Let's save this out so we can use it later
    with open('dictionary1.json', 'w') as dictionary_file:
        json.dump(dictionary, dictionary_file)

    allWordIndices = []
    # for each tweet, change each token to its ID in the Tokenizer's word_index
    for text in train_x:
        wordIndices = convert_text_to_index_array(text, dictionary)
        allWordIndices.append(wordIndices)

    # now we have a list of all tweets converted to index arrays.
    # cast as an array for future usage.
    allWordIndices = np.asarray(allWordIndices)

    # create one-hot matrices out of the indexed tweets
    train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
    # treat the labels as categories
    train_y = keras.utils.to_categorical(train_y, 2)

    model = Sequential()
    model.add(Dense(512, input_shape=(max_words, ), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(train_x,
              train_y,
              batch_size=32,
              epochs=5,
              verbose=1,
              validation_split=0.1,
              shuffle=True)

    model_json = model.to_json()
    with open('model1.json', 'w') as json_file:
        json_file.write(model_json)

    model.save_weights('model1.h5')
Example #26
def processData():
    """ Pre-process the Reuters data. """

    (x_train, y_train), (x_test,
                         y_test) = reuters.load_data(num_words=max_words,
                                                     test_split=0.2)

    # Tokenize the data
    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

    # Convert class vector to binary class matrix
    num_classes = np.max(y_train) + 1
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    return x_train, y_train, x_test, y_test, num_classes
Example #27
def preprocess_keras():
    # split into train / test sets
    train = pd.read_csv("data/long_train.csv")
    new_train = train.rename(columns={'class': 'article_class'}, inplace=False)
    #y_train = pd.get_dummies(new_train['article_class'])
    y = new_train.article_class.values
    x_text = new_train.word_seg.values
    X_train, X_test, y_train, y_test = train_test_split(x_text,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    # encode the class labels (10 classes in total)
    y_train = pd.Series(y_train)
    y_test = pd.Series(y_test)
    y_labels = list(y_train.value_counts().index)
    le = pr.LabelEncoder()
    le.fit(y_labels)
    num_labels = len(y_labels)
    y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]),
                             num_labels)
    y_test = to_categorical(y_test.map(lambda x: le.transform([x])[0]),
                            num_labels)

    # tokenize and build the word-to-id dictionary
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=" ")
    tokenizer.fit_on_texts(x_text)
    vocab = tokenizer.word_index

    # replace each word with its index from the dictionary
    X_train_word_ids = tokenizer.texts_to_sequences(X_train)
    X_test_word_ids = tokenizer.texts_to_sequences(X_test)

    # One-hot
    x_train_o = tokenizer.sequences_to_matrix(X_train_word_ids, mode='binary')
    x_test_o = tokenizer.sequences_to_matrix(X_test_word_ids, mode='binary')

    # sequence mode (padded index sequences)
    x_train_p = pad_sequences(X_train_word_ids, maxlen=20)
    x_test_p = pad_sequences(X_test_word_ids, maxlen=20)

    return x_train_o, y_train, vocab, x_test_o, y_test
Example #28
    def process(self,
                json_filename=None,
                h5_filename=None,
                plot=False,
                epochs=100):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        print("len(All): ", len(All))
        # randomly shuffle all tweets
        np.random.shuffle(All)

        ones_count = 0
        for r in All:
            tweet = r[0].strip()
            label = int(r[1])
            X_all.append(tweet)
            Y_all.append(label)

        print("Data Ingested")
        print("X_all[0]: ", X_all[0])
        tokenizer = Tokenizer(num_words=max_words, oov_token='unk')
        print("Fitting data")
        tokenizer.fit_on_texts(X_all)
        X_Seq_All = tokenizer.texts_to_sequences(X_all)

        print("X_Seq_All[0]", X_Seq_All[0])
        print("Final Conversion")
        X_Train = tokenizer.sequences_to_matrix(X_Seq_All, mode='binary')
        print("train_x[0]", X_Train[0])
        Y_Train = Y_all
        print("Create Model")
        model = Sequential()
        model.add(Dense(1, input_dim=10000))
        model.add(Activation('sigmoid'))
        model.summary()
        print("Compilation")
        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        history = model.fit(X_Train,
                            Y_Train,
                            epochs=epochs,
                            validation_split=0.20)
        print("Done")
Example #29
def preprocessing(X_train, X_test, Y_train, Y_test, num_classes):
    print('Before convert of sequence words to binary matrix...')
    print('X_train shape:', np.shape(X_train))
    print('X_test shape:', np.shape(X_test))

    print('Convert sequences of words (index) to binary matrix')
    tokenizer = Tokenizer(num_words=MAX_WORDS)
    # Return: numpy array of shape (len(sequences), num_words).
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Convert class label (integers vector) to binary class matrix')
    Y_train = keras.utils.to_categorical(Y_train, num_classes)
    Y_test = keras.utils.to_categorical(Y_test, num_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)
    return X_train, X_test, Y_train, Y_test
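For reference, a tiny standalone sketch of what sequences_to_matrix(mode='binary') returns, matching the shape noted in the comment above (toy sequences, not the Reuters data):

from keras.preprocessing.text import Tokenizer

toy_sequences = [[1, 3, 3], [2, 4]]   # already-indexed "documents"
tok = Tokenizer(num_words=5)          # columns 0..4; index 0 is reserved and stays empty
matrix = tok.sequences_to_matrix(toy_sequences, mode='binary')
print(matrix.shape)   # (2, 5)
print(matrix)         # [[0. 1. 0. 1. 0.]
                      #  [0. 0. 1. 0. 1.]]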
Example #30
def get_reuters_dataset(batch_size, max_words):
    (X_train, y_train), (X_test,
                         y_test) = reuters.load_data(nb_words=max_words,
                                                     test_split=0.2)
    nb_classes = np.max(y_train) + 1
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    batch_iterator = SimpleBatchIterator(X_train,
                                         y_train,
                                         batch_size,
                                         autoloop=True)
    test_batch_iterator = SimpleBatchIterator(X_test,
                                              y_test,
                                              len(X_test),
                                              autoloop=True)
    return batch_iterator, test_batch_iterator, nb_classes
Example #31
    def __split_data(self,mode,MAX_LEN):
        from sklearn.model_selection import train_test_split
        from keras.preprocessing.text import Tokenizer
        from keras.preprocessing.sequence import pad_sequences
        content, label=self.__read_data()
        train_data, test_data, train_label, test_label = train_test_split(content, label,
                                                                      test_size=0.1, random_state=42)
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
        tokenizer.fit_on_texts(content)
        vocab = tokenizer.word_index

        train_data_ids = tokenizer.texts_to_sequences(train_data)
        test_data_ids = tokenizer.texts_to_sequences(test_data)
        if mode=='one_hot':
            train_data = tokenizer.sequences_to_matrix(train_data_ids, mode='binary')
            test_data = tokenizer.sequences_to_matrix(test_data_ids, mode='binary')
        elif mode=='sequence':
            train_data = pad_sequences(train_data_ids, maxlen=MAX_LEN)
            test_data = pad_sequences(test_data_ids, maxlen=MAX_LEN)
        return train_data, test_data, train_label, test_label, vocab
Example #32
    def preprocess(self, X_train, y_train, X_val, y_val):
        X_train_headline, X_train_article = X_train
        X_val_headline, X_val_article = X_val

        if self.get_tokenizer() is None:
            tokenizer = Tokenizer(num_words=self.config['vocabulary_dim'])
            self.set_tokenizer(tokenizer)
        tokenizer = self.get_tokenizer()
        tokenizer.fit_on_texts(X_train_headline + X_train_article)
        X_train_headline = tokenizer.texts_to_sequences(X_train_headline)
        X_train_article = tokenizer.texts_to_sequences(X_train_article)
        X_val_headline = tokenizer.texts_to_sequences(X_val_headline)
        X_val_article = tokenizer.texts_to_sequences(X_val_article)

        X_train_headline = tokenizer.sequences_to_matrix(
            X_train_headline, mode=self.config['matrix_mode'])
        X_train_article = tokenizer.sequences_to_matrix(
            X_train_article, mode=self.config['matrix_mode'])
        X_val_headline = tokenizer.sequences_to_matrix(
            X_val_headline, mode=self.config['matrix_mode'])
        X_val_article = tokenizer.sequences_to_matrix(
            X_val_article, mode=self.config['matrix_mode'])

        y_train_stance = np_utils.to_categorical(y_train)
        y_train_related = np_utils.to_categorical(collapse_stances(y_train))
        y_val_stance = np_utils.to_categorical(y_val)
        y_val_related = np_utils.to_categorical(collapse_stances(y_val))

        return ({
            'headline_input': X_train_headline,
            'article_input': X_train_article,
        }, {
            'related_prediction': y_train_related,
            'stance_prediction': y_train_stance,
        }, {
            'headline_input': X_val_headline,
            'article_input': X_val_article,
        }, {
            'related_prediction': y_val_related,
            'stance_prediction': y_val_stance,
        })
Example #33
def tokenize(dic, data):
    # create a tokenizer and feed in word index
    t = Tokenizer(num_words=None, lower=True, split=' ')
    t.word_index = dic
    # convert words from each call transcription into an index array
    allWords = []
    transcriptions = data['Words']
    for text in transcriptions:
        words = convert_text_to_index_array(text, dic)
        allWords.append(words)
    # convert index array into a matrix and return it
    return t.sequences_to_matrix(allWords, mode='binary')
Example #34
def run_keras_example():
	max_words = 1000
	batch_size = 32
	nb_epoch = 5

	print('Loading data...')
	(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
	print(len(X_train), 'train sequences')
	print(len(X_test), 'test sequences')

	nb_classes = np.max(y_train)+1
	print(nb_classes, 'classes')

	print('Vectorizing sequence data...')
	tokenizer = Tokenizer(nb_words=max_words)
	X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
	X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)

	print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
	Y_train = np_utils.to_categorical(y_train, nb_classes)
	Y_test = np_utils.to_categorical(y_test, nb_classes)
	print('Y_train shape:', Y_train.shape)
	print('Y_test shape:', Y_test.shape)

	print('Building model...')
	model = Sequential()
	model.add(Dense(512, input_shape=(max_words,)))
	model.add(Activation('tanh'))
	model.add(Dropout(0.5))
	model.add(Dense(nb_classes))
	model.add(Activation('softmax'))

	model.compile(loss='categorical_crossentropy', optimizer='adam')

	history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
	score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
	print('Test score:', score[0])
	print('Test accuracy:', score[1])
Example #35

max_words = 10000
batch_size = 16

print "Loading data..."
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
print len(X_train), 'train sequences'
print len(X_test), 'test sequences'

nb_classes = np.max(y_train)+1
print nb_classes, 'classes'

print "Vectorizing sequence data..."
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode="binary")
X_test = tokenizer.sequences_to_matrix(X_test, mode="binary")
print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape

print "Convert class vector to binary class matrix (for use with categorical_crossentropy)"
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
print 'Y_train shape:', Y_train.shape
print 'Y_test shape:', Y_test.shape

print "Building model..."
model = Sequential()
model.add(Dense(max_words, 256, init='normal'))
model.add(Activation('relu'))
model.add(BatchNormalization(input_shape=(256,))) # try without batch normalization (doesn't work as well!)
Example #36
    # `text_to_word_sequence` lowercases, strips punctuation and splits the text
    # into word tokens; each token is then mapped to its ID in the word_index.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
# treat the labels as categories
train_y = keras.utils.to_categorical(train_y, 2)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
    optimizer='adam',
Example #37
print(y_train_cat.shape, y_test_cat.shape)


# In[8]:

nb_classes = np.max(encoded_Y_train)+1
print(nb_classes, 'classes')


# In[9]:

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_num = tokenizer.texts_to_sequences(X_train)
X_test_num = tokenizer.texts_to_sequences(X_test)
X_train_mat = tokenizer.sequences_to_matrix(X_train_num)
X_test_mat = tokenizer.sequences_to_matrix(X_test_num)


# In[10]:

print('X_train shape:', X_train_mat.shape)
print('X_test shape:', X_test_mat.shape)


# In[36]:

batch_size = 100
nb_epoch = 50

Example #38
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." %(word))
    return wordIndices

# read in your saved model structure
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
# and create a model from that
model = model_from_json(loaded_model_json)
# and weight your nodes with your saved values
model.load_weights('model.h5')

# okay here's the interactive part
while 1:
    evalSentence = raw_input('Input a sentence to be evaluated, or Enter to quit: ')

    if len(evalSentence) == 0:
        break

    # format your input for the neural net
    testArr = convert_text_to_index_array(evalSentence)
    input = tokenizer.sequences_to_matrix([testArr], mode='binary')
    # predict which bucket your input belongs in
    pred = model.predict(input)
    # and print it for the humons
    print("%s sentiment; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
Example #39
	y_train, y_valid, y_test = data[1], data[3], data[5]
	#X_train, X_valid, X_test = data[0], data[2], data[4]

	vec = CountVectorizer()
	X_train = vec.fit_transform([' '.join(l) for l in data[0]])
	X_valid = vec.transform([' '.join(l) for l in data[2]])
	X_test = vec.transform([' '.join(l) for l in data[4]])

	tokenizer = Tokenizer()
	tokenizer.fit_on_texts([' '.join(l) for l in data[0]])

	X_train_keras = tokenizer.texts_to_sequences([' '.join(l) for l in data[0]])
	X_test_keras = tokenizer.texts_to_sequences([' '.join(l) for l in data[4]])
	X_valid_keras = tokenizer.texts_to_sequences([' '.join(l) for l in data[2]])
	X_train_keras = tokenizer.sequences_to_matrix(X_train_keras)
	X_test_keras = tokenizer.sequences_to_matrix(X_test_keras)
	X_valid_keras = tokenizer.sequences_to_matrix(X_valid_keras)

	n_classes = np.max(y_train) + 1

	Y_train = np_utils.to_categorical(y_train, n_classes)
	Y_test = np_utils.to_categorical(y_test, n_classes)
	Y_valid = np_utils.to_categorical(y_valid, n_classes)

	print('KERAS...')
	### MLP
	model = Sequential()
	model.add(Dense(output_dim=2048, input_dim=X_test_keras.shape[1], init='glorot_normal', W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
	model.add(Activation('tanh'))
	model.add(Dense(output_dim=256, input_dim=2048, init='glorot_normal', W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
Example #40
print('Fitting text on tokenizer...')
tokenizer = Tokenizer(nb_words=max_words)
tokenizer.fit_on_texts(X)

# Split the data
print('Split text into train and test...')

split_point = int(len(X) * 0.90)
X_train, X_test = X[:split_point], X[split_point:]
y_train, y_test = y[:split_point], y[split_point:]

print('Text to sequence - sequence to matrix for data ...')
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = tokenizer.sequences_to_matrix(X_train)
X_test = tokenizer.sequences_to_matrix(X_test)

nb_classes = np.max(y_train)+1
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)


# Pad input sequences
input_size = len(max(X_train, key=len))

X_train = sequence.pad_sequences(X_train, maxlen=input_size)
X_test = sequence.pad_sequences(X_test, maxlen=input_size)

# Setting some parameters
batch_size = 20
Example #41
end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

# run simple linear regression to compare performance

#based on grid search done by:
#https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb

# the tf-idf vectors capture term-frequency statistics: think of each number as how many times
# a word occurred in a text, scaled down by how common the word is across all texts

tfidfTokenizer = Tokenizer(nb_words=max_features)
tfidfTokenizer.fit_on_sequences(X_train.tolist())
X_train_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_train.tolist(), mode="tfidf"))
X_test_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_test.tolist(), mode="tfidf"))

#check tfidf matrix
print(X_train_tfidf)
print(X_train_tfidf.shape, X_test_tfidf.shape)

from sklearn.linear_model import LogisticRegression

model_tfidf_reg = LogisticRegression(random_state=0, C=0.001, penalty='l2', verbose=1)
model_tfidf_reg.fit(X_train_tfidf, y_train)

from sklearn.metrics import accuracy_score
#calculate test and train accuracy
print("train acc:", accuracy_score(y_test, model_tfidf_reg.predict(X_train_tfidf)))
print("test acc:", accuracy_score(y_test, model_tfidf_reg.predict(X_test_tfidf)))
Example #42
# In[22]:


print(x_train[0])
print(y_train[0])


# ## 3. One-hot encoding the input
# Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1.

# In[23]:


# One-hot encoding the input into vector mode, each of length 1000
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train[0])


# And we'll also one-hot encode the output.

# In[24]:


# One-hot encoding the output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(y_train.shape)
print(y_test.shape)
Example #43
def process_X_data(X, nb_features):
    assert nb_features > 0

    tokenizer = Tokenizer(num_words=nb_features)
    return tokenizer.sequences_to_matrix(X, mode='binary')
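A closing usage sketch for this helper; the IMDB dataset here is illustrative, any list of index sequences works:

from keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=2000)
x_train_bin = process_X_data(x_train, nb_features=2000)   # shape (25000, 2000)
x_test_bin = process_X_data(x_test, nb_features=2000)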