def load_test_dataset(self, filename='testDatasetWithOutNeuTwo', vocab_name='vocabProc'):
    """ Something is wrong with this function. Does not get the same result as
    before when loading in the new data... """
    """ Load test dataset """
    self.test_x, self.test_y = load_csv('datasets/' + filename, target_column=1)

    """ Get restored vocabulary processor """
    # restore() is a factory method that returns the restored processor,
    # so its return value must be kept
    self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

    """ Encode pos, neu and neg to numbers """
    labelEncoder = LabelEncoder()
    labelEncoder.fit(self.test_y)
    self.test_y = labelEncoder.transform(self.test_y)

    """ Change the list of sentences to a list of sequences of word ids """
    self.test_x = np.array(
        list(self.vocabProcessor.transform([x[0] for x in self.test_x])))

    """ Pad the sequences to fit the longest sentence """
    self.test_x = pad_sequences(self.test_x, maxlen=self.max_words, value=0.)

    """ Convert labels to binary vectors """
    self.test_y = to_categorical(self.test_y, nb_classes=self.number_of_classes)
def load_dataset_training(self, vocab_name, filename='datasetWithoutNeutral'):
    """ Load the dataset """
    X, Y = load_csv('datasets/' + filename, target_column=2, columns_to_ignore=[0])

    """ Count max words from the longest sentence """
    self.max_words = max([len(x[0].split(" ")) for x in X])

    """ Create a vocabulary processor sized to the longest sentence """
    self.vocabProcessor = VocabularyProcessor(self.max_words)

    """ Encode pos, neu and neg to numbers """
    labelEncoder = LabelEncoder()
    labelEncoder.fit(Y)
    Y = labelEncoder.transform(Y)

    """ Change the list of sentences to a list of sequences of word ids """
    X = np.array(list(self.vocabProcessor.fit_transform([x[0] for x in X])))

    """ Split the dataset into a training set and a test set """
    self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
        X, Y, test_size=0.10, random_state=7)

    """ Pad the sequences to fit the longest sentence """
    self.X_train = pad_sequences(self.X_train, maxlen=self.max_words, value=0.)
    self.X_test = pad_sequences(self.X_test, maxlen=self.max_words, value=0.)

    """ Convert labels to binary vectors """
    self.Y_train = to_categorical(self.Y_train, nb_classes=self.number_of_classes)
    self.Y_test = to_categorical(self.Y_test, nb_classes=self.number_of_classes)

    self.vocabProcessor.save(vocab_name)
def getData(dataframe):
    # Extract the required columns for inputs and outputs
    totalX = dataframe.sub_grade
    totalY = dataframe.short_emp
    print(dataframe.sub_grade.value_counts())
    totalX = totalX.values
    totalY = totalY.values

    vocab_proc = VocabularyProcessor(1)
    # Convert the strings in the input into integers corresponding to the dictionary positions.
    # Data is automatically padded, so we do not need to call pad_sequences manually.
    """
    totalX = "I am happy today. I feel sad today."
    vocab_proc = VocabularyProcessor(1)
    totalX = np.array(list(vocab_proc.fit_transform(totalX)))
    #totalX = totalX.transpose()
    #totalX = pad_sequences(totalX, maxlen=100, value=0.)
    print(totalX)
    """
    totalX = np.array(list(vocab_proc.fit_transform(totalX)))
    totalY = to_categorical(totalY, nb_classes=2)  # totalY=[1,0,1,0,1] needs to be an array

    # Split into training and testing data
    trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)
    return trainX, testX, trainY, testY
def __init__(self, config):
    self.config = config

    # Load data from files
    positive_data_file = self.config.positive_data_file
    negative_data_file = self.config.negative_data_file
    positive_examples = list(open(positive_data_file, "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]

    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    self.max_text_length = max([len(x.split(" ")) for x in x_text])

    # Transform words to ids
    vocab_processor = VocabularyProcessor(self.max_text_length)
    X = np.array(list(vocab_processor.fit_transform(x_text)))
    self.vocab_size = len(vocab_processor.vocabulary_)

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    self.train_input, self.valid_input, self.train_label, self.valid_label = train_test_split(
        X, y, random_state=2018, test_size=0.1)
    print('training size: {}'.format(len(self.train_input)))
def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
    self.model_dir = model_dir
    os.makedirs(self.model_dir, exist_ok=True)
    if raw_sentence_pairs is None:
        self.restore()
    else:
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
        if char_embeddings:
            log('Chosen char embeddings.')
            self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
        else:
            log('Chosen word embeddings.')
            self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
        max_sentence_length = max(self.sentences_lengths)
        log('Maximum sentence length : {}'.format(max_sentence_length))

        if char_embeddings:
            log('Processing sentences with char embeddings...')
            self.vocabulary = VocabularyProcessor(
                max_document_length=max_sentence_length,
                tokenizer_fn=char_tokenizer,
            )
        else:
            log('Processing sentences with word embeddings...')
            self.vocabulary = VocabularyProcessor(
                max_document_length=max_sentence_length,
            )
        log('Sentences have been successfully processed.')
        self.vocabulary.fit(raw_sentence_pairs)
        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(self.model_dir))
def load_vocab_processor(name, max_length, min_frequency):
    """ Load a saved vocabulary model """
    print('Loading vocabulary model from {}'.format(name))
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.restore(name)
    return vp
def do_vocabulary_table():
    x, y = load_data()
    vp = VocabularyProcessor(max_document_length=max_document_length, min_frequency=0)
    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))
    return x, y
def __init__(self, raw_sentence, model_dir, save_vocab=True):
    os.makedirs(model_dir, exist_ok=True)
    raw_sentence = raw_sentence.ravel()
    raw_sentence = [str(x) for x in list(raw_sentence)]
    self.sentence_length = [
        len(str(x).split(' ')) for x in list(raw_sentence)
    ]
    max_sentence_length = max(self.sentence_length)
    self.vocabulary = VocabularyProcessor(max_sentence_length)
    if save_vocab:
        self.vocabulary.save('{}/vocab'.format(model_dir))
def make_vocab_processor(name, text, max_length, min_frequency):
    """ Generate a vocabulary model """
    print('Making vocabulary model...')
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.fit(text)
    if name is None:
        return vp
    else:
        print('Saving vocabulary model to {}'.format(name))
        vp.save(name)
        return vp
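The two helpers above pair naturally: one fits and saves a vocabulary, the other restores it for later use. A minimal round-trip sketch, assuming a small in-memory corpus; the corpus strings and the file name 'vocab.model' are illustrative only:

# Hypothetical usage of make_vocab_processor / load_vocab_processor; names are placeholders.
corpus = ['the cat sat on the mat', 'the dog barked']

# Fit a vocabulary on the corpus and save it to disk.
vp = make_vocab_processor('vocab.model', corpus, max_length=10, min_frequency=0)

# Later (e.g. at inference time), restore it and encode new text.
vp = load_vocab_processor('vocab.model', max_length=10, min_frequency=0)
ids = list(vp.transform(['the cat barked']))
print(ids)  # one fixed-length array of word ids per input sentence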
def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
    """ Load vocabulary processor """
    # restore() is a factory method that returns the restored processor
    self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

    """ Transform the sentence into a matrix of word ids """
    sentence = np.array(
        list(self.vocabProcessor.transform([x[0] for x in sentence])))
    sentence = pad_sequences(sentence,
                             maxlen=self.vocabProcessor.max_document_length,
                             value=0.)

    """ Predict sentence """
    pred_score = self.model.predict(sentence)
    return pred_score
def _create_vocab(min_count=3):
    """ Build the vocabulary; requires the training samples. """

    def gen_documents():
        for path in (TRAIN_POS_PATH, TRAIN_NEG_PATH):
            with codecs.open(path, 'r', 'utf-8') as file:
                for line in file:
                    yield line[:-1]

    vocab = VocabularyProcessor(SEQUENCE_LEN, min_count - 1, tokenizer_fn=chinese_tokenizer)
    vocab.fit(gen_documents())
    return vocab
def build_model(vocabFile, model_type='bilstm'):
    processor = VocabularyProcessor.restore(vocabFile)
    n_words = len(processor.vocabulary_)

    net = tflearn.input_data([None, 300])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=200)
    if model_type == 'bilstm':
        net = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(200),
                                        tflearn.BasicLSTMCell(200))
        net = dropout(net, 0.5)
    elif model_type == 'lstm':
        net = tflearn.lstm(net, 200, dropout=0.5)
        net = dropout(net, 0.5)
    elif model_type == 'cnn':
        net = conv_model(net)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.05,
                             loss='categorical_crossentropy')
    return net
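A hedged usage sketch for build_model: wrap the returned graph in a tflearn.DNN and train it on id matrices produced by the same saved vocabulary. The file names and the trainX/trainY arrays are placeholders, not part of the original snippet:

# Illustrative only: 'vocab.pickle', trainX/trainY and the checkpoint name are placeholders.
net = build_model('vocab.pickle', model_type='lstm')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64)
model.save('text_classifier.tfl')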
def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
    self.model_dir = model_dir
    os.makedirs(self.model_dir, exist_ok=True)
    if raw_sentence_pairs is None:
        self.restore()
    else:
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
        self.sentences_lengths = [
            len(str(x).split(' ')) for x in list(raw_sentence_pairs)
        ]
        max_sentence_length = max(self.sentences_lengths)
        self.vocabulary = VocabularyProcessor(max_sentence_length)
        self.vocabulary.fit(raw_sentence_pairs)
        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(self.model_dir))
class SentimentLookup:
    net = tflearn.input_data([None, 40])
    net = tflearn.embedding(net, input_dim=12495, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.load(MODEL)
    vp = VocabularyProcessor.restore(VOCAB)

    def _process_tweet(self, tweet=""):
        cleaned = str(tweet).upper()
        cleaned = re.sub('&\w+;', '', cleaned)
        cleaned = re.sub('\'', '', cleaned)
        cleaned = re.sub('@\w+ ', 'USERNAME ', cleaned)
        cleaned = re.sub('[^A-Z ]', '', cleaned)
        cleaned = re.sub('[ ]+', ' ', cleaned)
        return cleaned.strip()

    def sentiment(self, data):
        if isinstance(data, str):
            query = [x for x in SentimentLookup.vp.transform([self._process_tweet(data)])]
            bad, good = SentimentLookup.model.predict(query).tolist()[0]
            return good
        data = map(self._process_tweet, data)
        query = [x for x in SentimentLookup.vp.transform(data)]
        return SentimentLookup.model.predict(query)[:, 1]
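A minimal usage sketch for SentimentLookup, assuming MODEL and VOCAB point at an existing trained checkpoint and saved vocabulary; the example strings are illustrative:

# Illustrative only: requires the MODEL and VOCAB files referenced by the class.
lookup = SentimentLookup()
print(lookup.sentiment('I love this phone'))                 # single string -> positive probability
print(lookup.sentiment(['great stuff', 'worst day ever']))   # iterable -> array of positive scores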
class DatasetVectorizer:
    def __init__(self, raw_sentence, model_dir, save_vocab=True):
        os.makedirs(model_dir, exist_ok=True)
        raw_sentence = raw_sentence.ravel()
        raw_sentence = [str(x) for x in list(raw_sentence)]
        self.sentence_length = [
            len(str(x).split(' ')) for x in list(raw_sentence)
        ]
        max_sentence_length = max(self.sentence_length)
        self.vocabulary = VocabularyProcessor(max_sentence_length)
        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence):
        # num_instances, num_classes = raw_sentence.shape
        num_instances = raw_sentence.shape[0]
        num_classes = 1
        raw_sentence = raw_sentence.ravel()
        for i, v in enumerate(raw_sentence):
            if v is np.nan:
                print(i, v)
        vectorized_sentence = np.array(
            list(self.vocabulary.transform(raw_sentence)))
        vectorized_sentence = vectorized_sentence.reshape(
            num_instances, num_classes, self.max_sentence_len)
        return vectorized_sentence[:, 0, :]
def __init__(self):
    if not exists(VOCABULARY_PATH):
        self._vocab = self._create_vocab()
        self._vocab.save(VOCABULARY_PATH)
    else:
        self._vocab = VocabularyProcessor.restore(VOCABULARY_PATH)
    self._model = self._create_model()
    if exists(MODEL_PATH + '.meta'):
        self._model.load(MODEL_PATH, True)
def __init__(self, args):
    self.use_gaze = args.use_gaze
    categories = ['NEGATIVE', 'POSITIVE']
    if args.num_sentiments == 3:
        categories.append('NEUTRAL')

    dataset = load_files(container_path=DATASET_DIR, categories=categories,
                         load_content=True, encoding='utf-8')
    self.sentences_, self.sentence_numbers, _, self.targets, _ = dataset.values()
    self.sentence_numbers = [
        int(re.search(r'\d{1,3}', fname).group())
        for fname in dataset['filenames']
    ]

    # Adopting Hollenstein's method of building the vocab
    self.sentences = [clean_str(s) for s in self.sentences_]
    self.num_sentences = len(self.sentences)
    self.max_sentence_length = max([len(s.split()) for s in self.sentences])
    self.vocab_processor = VocabularyProcessor(self.max_sentence_length)
    self.indexed_sentences = torch.LongTensor(
        list(self.vocab_processor.fit_transform(self.sentences)))
    self.vocabulary = list(self.vocab_processor.vocabulary_._mapping.keys())

    print('\n> Data set loaded. Sentiment classes:', categories)
    print('> Max sentence length:', self.max_sentence_length, 'words.')

    if self.use_gaze:
        self.et_features = EyeTrackingFeatures(self.max_sentence_length)
        print('> Loaded eye-tracking features.\n')
    else:
        self.et_features = np.zeros(self.num_sentences)
def build_vocabulary(inFile, dtype, vocabFile):
    # Maximum text length and number of classes
    MAX_LENGTH = 300
    NB_CLASSES = 2
    # Read in the pre-tokenized text
    doc = MySentences(inFile, dtype, 'get_content')
    # Map the raw text to word indices
    processor = VocabularyProcessor(MAX_LENGTH, min_frequency=5)
    processor.fit(doc)
    processor.save(vocabFile)
class DatasetVectorizer:
    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            self.sentences_lengths = [
                len(str(x).split(' ')) for x in list(raw_sentence_pairs)
            ]
            max_sentence_length = max(self.sentences_lengths)
            self.vocabulary = VocabularyProcessor(max_sentence_length)
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
def text2npy(inFile, outFile, vocabFile, dtype):
    processor = VocabularyProcessor.restore(vocabFile)
    doc = MySentences(inFile, dtype, 'get_content')
    train_doc = list(processor.transform(doc))
    # to_categorical could be used here for one-hot encoding:
    # to_categorical(np.array(lable), NB_CLASSES)

    if dtype == 'train':
        # Transform the labels
        lable = []
        for y in MySentences(inFile, dtype, 'get_info'):
            lable.append(int(y))
        y = np.array(lable)
        # Save to file
        np.save(outFile, np.column_stack([train_doc, y]))
    elif dtype == 'test':
        np.save(outFile, train_doc)
        fw = open(outFile + "_doc.txt", 'w')
        for y in MySentences(inFile, dtype, 'get_info'):
            fw.write(y.encode('utf8') + "\n")
        fw.close()
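A sketch of how build_vocabulary and text2npy might be chained, assuming the project's MySentences reader and placeholder file names (none of these paths appear in the original snippets):

# Illustrative pipeline; file names are placeholders.
build_vocabulary('train_seg.txt', 'train', 'vocab.model')            # fit and save the vocabulary
text2npy('train_seg.txt', 'train_ids.npy', 'vocab.model', 'train')   # word ids + labels
text2npy('test_seg.txt', 'test_ids.npy', 'vocab.model', 'test')      # word ids only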
save_model = 1

dataframe = pd.read_csv('ign.csv').ix[:, 1:3]
# Fill null values with empty strings
dataframe.fillna(value='', inplace=True)
# score_phrase and title
print(dataframe.columns.values)

# Extract the required columns for inputs and outputs
totalX = dataframe.title
totalY = dataframe.score_phrase

# Convert the strings in the input into integers corresponding to the dictionary positions.
# Data is automatically padded to the max document length, so we do not need to call
# pad_sequences manually.
vocab_proc = VocabularyProcessor(15)  # max document length
totalX = np.array(list(vocab_proc.fit_transform(totalX)))
# totalX now contains a matrix with the word ids (indexes) of the sentences

# We will have 11 classes in total for prediction, indices from 0 to 10
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
# Here totalY holds the dictionary indices of the score phrases (0 to 10)
print(totalY[5])

# Convert the indices into 11-dimensional vectors
totalY = to_categorical(totalY, 11)  # totalY is now a binary matrix
print(totalY[5])
class DatasetVectorizer:
    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            if char_embeddings:
                log('Chosen char embeddings.')
                self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
            else:
                log('Chosen word embeddings.')
                self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
            max_sentence_length = max(self.sentences_lengths)
            log('Maximum sentence length : {}'.format(max_sentence_length))

            if char_embeddings:
                log('Processing sentences with char embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                    tokenizer_fn=char_tokenizer,
                )
            else:
                log('Processing sentences with word embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                )
            log('Sentences have been successfully processed.')
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        vectorized_sentence_pairs = np.array(list(self.vocabulary.transform(raw_sentence_pairs)))
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
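A hedged usage sketch for this DatasetVectorizer, assuming the project's log helper is importable; the sentence pairs and the model directory name are placeholders:

# Illustrative only: pairs and 'model_dir' are placeholders.
pairs = np.array([['how are you', 'how do you do'],
                  ['good morning', 'good night']])

vectorizer = DatasetVectorizer('model_dir', char_embeddings=False,
                               raw_sentence_pairs=pairs)
sen1, sen2 = vectorizer.vectorize_2d(pairs)  # two (num_pairs, max_sentence_len) id matrices
print(vectorizer.vocabulary_size, vectorizer.max_sentence_len)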
def restore(self):
    self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
from tflearn.data_utils import VocabularyProcessor

vocab = {'hello': 3, '.': 5, 'world': 20, '/': 10}
sentences = ['hello world . / hello', 'hello']

vocab_processor = VocabularyProcessor(max_document_length=6, vocabulary=vocab)
encoded = list(vocab_processor.transform(sentences))
print(encoded)
def run_on_ign():
    df_dataset = pd.read_csv(ign_dataset_path)
    df_dataset.set_index(['index'], inplace=True)
    # Fill null values with empty strings
    df_dataset.fillna(value='', inplace=True)

    # Extract the required columns for inputs and outputs
    data_X = df_dataset.title          # e.g. data_X[0]
    label_Y = df_dataset.score_phrase  # e.g. label_Y[5]

    # Convert the strings in the input into integers corresponding to the dictionary positions
    # (maps documents to sequences of word ids). Data is automatically padded to the max
    # document length, so we do not need to call pad_sequences manually.
    vocab_proc = VocabularyProcessor(15)
    total_X = np.array(list(vocab_proc.fit_transform(data_X)))

    # We will have 11 classes in total for prediction, indices from 0 to 10,
    # as there are 11 unique score phrases. Use a vocabulary processor for a single word.
    vocab_proc2 = VocabularyProcessor(1)
    total_Y = np.array(list(vocab_proc2.fit_transform(label_Y))) - 1

    # Convert the indices into 11-dimensional vectors.
    # total_Y = to_categorical(total_Y, nb_classes=11) was rejected here because it generated
    # the same array for different score phrases, so get_categorical is applied per row instead.
    array_list = []
    for array in total_Y:
        array_list.append(get_categorical(array, 11))
    total_Y = np.array(array_list)

    # Split into training and testing data
    train_X, test_X, train_Y, test_Y = train_test_split(total_X, total_Y, test_size=0.1)

    # Build the network for classification; each input has a length of 15
    net = tflearn.input_data([None, 15])
    # The 15 input word integers are cast into 256 dimensions each, creating a word embedding.
    # We assume the dictionary has 10000 words maximum.
    net = tflearn.embedding(net, input_dim=10000, output_dim=256)
    # Each input has a size of 15x256, and each of these 256-dimensional vectors is fed into
    # the recurrent layer one at a time. All the intermediate outputs are collected and then
    # passed on to the second recurrent layer.
    net = tflearn.gru(net, 256, dropout=0.9, return_seq=True)
    # The intermediate outputs are passed to another recurrent layer, and only the final
    # output is collected this time.
    net = tflearn.gru(net, 256, dropout=0.9)
    # The output is then sent to a fully connected layer that gives us our final 11 classes.
    net = tflearn.fully_connected(net, 11, activation='softmax')
    # We use the Adam optimizer instead of standard SGD since it converges much faster.
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=0)
    if check_file_exist(ign_model_path):
        model.load(ign_model_path)

    model.fit(train_X, train_Y, validation_set=(test_X, test_Y),
              show_metric=True, batch_size=32, n_epoch=20)

    if save_model:
        print("Saving model as './ign_model.tfl'")
        model.save(ign_model_path)
    return 0
class CNN(object):
    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self, vocab_name, filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename, target_column=2, columns_to_ignore=[0])

        """ Count max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])

        """ Create a vocabulary processor sized to the longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)

        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)

        """ Change the list of sentences to a list of sequences of word ids """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0] for x in X])))

        """ Split the dataset into a training set and a test set """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)

        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train, maxlen=self.max_words, value=0.)
        self.X_test = pad_sequences(self.X_test, maxlen=self.max_words, value=0.)

        """ Convert labels to binary vectors """
        self.Y_train = to_categorical(self.Y_train, nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test, nb_classes=self.number_of_classes)

        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self, model_name, outputDim=300, number_of_filters=60,
            filterSize=[3, 4], padding='same',
            activation_function_convLayer='relu', regularizer='L2',
            dropouts=0.5, activation_function_fc='softmax', optimizer='adam',
            learning_rate=0.001, loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]

        """ Define input shape and create word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)

        """ Add two convolutional layers (a third is commented out). Set the number of
        filters and the filter sizes, then merge the outputs together. """
        conv1 = conv_1d(self.cnn_model, nb_filter=number_of_filters,
                        filter_size=filterSize[0], padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model, nb_filter=number_of_filters,
                        filter_size=filterSize[1], padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        # conv3 = conv_1d(cnn_model, nb_filter=128, filter_size=5, padding='same',
        #                 activation='relu', regularizer='L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)

        """ Expand one dimension to fit the max_pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)

        """ Instantiate dropout layer and specify dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)

        """ Instantiate fully connected layer and regression layer. """
        self.cnn_model = fully_connected(self.cnn_model, self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model, optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function, name='models/' + model_name)

    def train_and_save(self, model_name, tensorboard_verbose=0, tensorboard_dir='/logs/',
                       nb_epochs=5, shuffle=True, show_metric=True):
        """ Instantiate the deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train, self.Y_train, n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test), shuffle=shuffle,
                       show_metric=show_metric, batch_size=self.batch_size,
                       run_id=model_name)

        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self, model_name, outputDim=300, number_of_filters=60,
                   filterSize=[3, 4], padding='same',
                   activation_function_convLayer='relu', regularizer='L2',
                   dropouts=0.5, activation_function_fc='softmax',
                   optimizer='adam', learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0, tensorboard_dir='/logs/'):
        """ Has to be passed the same values that the model was trained with.
        If the model was trained on default values, the parameters are passed automatically. """
        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self, filename='testDatasetWithOutNeuTwo', vocab_name='vocabProc'):
        """ Something is wrong with this function. Does not get the same result as
        before when loading in the new data... """
        """ Load test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename, target_column=1)

        """ Get restored vocabulary processor """
        # restore() is a factory method that returns the restored processor
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)

        """ Change the list of sentences to a list of sequences of word ids """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))

        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x, maxlen=self.max_words, value=0.)

        """ Convert labels to binary vectors """
        self.test_y = to_categorical(self.test_y, nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metrix_score = self.model.evaluate(self.test_x, self.test_y,
                                           batch_size=self.batch_size)
        return metrix_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

        """ Transform the sentence into a matrix of word ids """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(sentence,
                                 maxlen=self.vocabProcessor.max_document_length,
                                 value=0.)

        """ Predict sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self, list_of_sentences=[[''], ['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

        """ Transform the sentences into a matrix of word ids """
        sentences = np.array(
            list(self.vocabProcessor.transform([x[0] for x in list_of_sentences])))
        sentences = pad_sequences(sentences,
                                  maxlen=self.vocabProcessor.max_document_length,
                                  value=0.)

        """ Predict sentences """
        pred_score = self.model.predict(sentences)
        return pred_score
from tflearn.data_utils import pad_sequences, to_categorical, load_csv
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool, max_pool_1d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression

batch_size = 64

""" Load the dataset """
# X, Y = load_csv('datasetFullList', target_column=2, columns_to_ignore=[0])
X, Y = load_csv('datasetWithoutNeutral', target_column=2, columns_to_ignore=[0])

""" Count max words from the longest sentence """
max_words = max([len(x[0].split(" ")) for x in X])

""" Create a vocabulary processor sized to the longest sentence """
vocab = VocabularyProcessor(max_words)

""" Encode pos, neu and neg to numbers """
labelEncoder = LabelEncoder()
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y)

""" Change the list of sentences to a list of sequences of word ids """
X = np.array(list(vocab.fit_transform([x[0] for x in X])))

""" Split the dataset into a training set and a test set """
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=7)

""" Pad the sequences to fit the longest sentence """
X_train = pad_sequences(X_train, maxlen=max_words, value=0.)
X_test = pad_sequences(X_test, maxlen=max_words, value=0.)

""" Convert labels to binary vectors """
data.loc[data["score_phrase"] == "Disaster", "score_phrase"] = 0 data.loc[data["score_phrase"] == "Unbearable", "score_phrase"] = 0 data.loc[data["score_phrase"] == "Painful", "score_phrase"] = 0 data.loc[data["score_phrase"] == "Bad", "score_phrase"] = 0 data.loc[data["score_phrase"] == "Mediocre", "score_phrase"] = 0 data.loc[data["score_phrase"] == "Awful", "score_phrase"] = 0 data.loc[data["score_phrase"] == "Amazing", "score_phrase"] = 1 data.loc[data["score_phrase"] == "Great", "score_phrase"] = 1 data.loc[data["score_phrase"] == "Okay", "score_phrase"] = 1 data.loc[data["score_phrase"] == "Masterpiece", "score_phrase"] = 1 data.loc[data["score_phrase"] == "Good", "score_phrase"] = 1 # tokenize title # data["token_title"] = data["title"].apply(nltk.word_tokenize) word_processor = VocabularyProcessor(100) tmp = np.array(list(word_processor.fit_transform(data["title"]))) # split into train and test data train_data, test_data = model_selection.train_test_split(data, train_size=0.9) trainX = np.array(list(word_processor.fit_transform(train_data["title"]))) trainY = train_data.loc[:, ["score_phrase"]].as_matrix() testX = np.array(list(word_processor.fit_transform(test_data["title"]))) testY = test_data.loc[:, ["score_phrase"]].as_matrix() # Converting labels to binary vectors trainY = to_categorical(trainY, nb_classes=2) testY = to_categorical(testY, nb_classes=2) # Network building net = tflearn.input_data([None, 100])
'\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\')
# print(type(df.num.apply(str)))
# print(type(df.text))
text = df['text'].astype(str)
# num = df['num'].astype(str)
# print(type(text))
# print(text)

# Extract the required columns for inputs and outputs
totalX = text
totalY = df.num

# Convert the strings in the input into integers corresponding to the dictionary positions.
# Data is automatically padded to the processor's max length; pad_sequences is applied
# again below to reach length 100.
vocab_proc = VocabularyProcessor(10)
totalX = np.array(list(vocab_proc.fit_transform(totalX)))
seed = np.array(list(vocab_proc.fit_transform(seed)))

# Map the class labels to dictionary indices, starting from 0
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1

# Convert the indices into one-hot vectors (10 classes here)
totalY = to_categorical(totalY, nb_classes=10)

# Split into training and testing data
# trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)
totalX = pad_sequences(totalX, maxlen=100, value=0.)
seed = pad_sequences(seed, maxlen=100, value=0.)
# testX = pad_sequences(testX, maxlen=100, value=0.)
# print(trainX)
# print(trainY)
def chinese_tokenizer(documents):
    """ Convert Chinese text into sequences of words """
    for document in documents:
        # Convert traditional characters to simplified
        text = HanziConv.toSimplified(document)
        # Lowercase any English text
        text = text.lower()
        # Tokenize
        yield list(cut(text))


# Pad or truncate sequences to length 100 and drop words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary; it cannot be changed after it is created
vocab.fit(DOCUMENTS)

# Save and reload the vocabulary
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert text into sequences of word ids; unknown and padding tokens map to id 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
save_model = 0

# Select only the two columns we require: the game title and its corresponding score phrase
dataframe = pd.read_csv('ign.csv').ix[:, 1:3]
# Fill null values with empty strings
dataframe.fillna(value='', inplace=True)
# print(dataframe.score_phrase.value_counts())

# Extract the required columns for inputs and outputs
totalX = dataframe.title
totalY = dataframe.score_phrase

# Convert the strings in the input into integers corresponding to the dictionary positions.
# Data is automatically padded, so we do not need to call pad_sequences manually.
vocab_proc = VocabularyProcessor(15)
totalX = np.array(list(vocab_proc.fit_transform(totalX)))

# We will have 11 classes in total for prediction, indices from 0 to 10
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
# Convert the indices into 11-dimensional vectors
totalY = to_categorical(totalY, nb_classes=11)

# Split into training and testing data
trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)

# Build the network for classification
# Each input has a length of 15
net = tflearn.input_data([None, 15])
# The 15 input word integers are then cast into 256 dimensions each, creating a word embedding.