def __init__(self, ratings_filename='../Data/ratings.npz', n_hidden_factors=10): self.data = DataLoader.ratings_data(ratings_filename) self.n_users, self.n_items = self.data.shape self.n_hidden_factors = n_hidden_factors self.corpus_ix = self.data.nonzero() self.alpha = np.random.uniform() self.beta_user = np.random.rand(self.n_users) self.beta_item = np.random.rand(self.n_items) self.gamma_user = np.random.rand(self.n_users, self.n_hidden_factors) self.gamma_item = np.random.rand(self.n_items, self.n_hidden_factors) self.predicted_rating = np.zeros((self.n_users, self.n_items))
def __init__(self, reviews_filename='../Data/reviews.npz', n_topics=10): data = DataLoader.review_data(reviews_filename) self.n_docs, self.n_vocab = data.shape self.n_topics = n_topics self.phi = np.random.rand(n_topics, self.n_vocab) self.phi /= self.phi.sum(axis=1)[:, None] self.theta = np.random.rand(self.n_docs, n_topics) self.theta /= self.theta.sum(axis=1)[:, None] self.topic_frequencies = np.zeros((self.n_docs, self.n_topics)) self.word_topic_frequencies = np.zeros((self.n_topics, self.n_vocab)) self.backgroundwords = np.zeros(self.n_vocab) self.z = list() self.reviews = list() for doc_ix in xrange(self.n_docs): data_review = flatten_bow(data[doc_ix, :].toarray()[0]) n_words = len(data_review) self.z.append(np.zeros(n_words, dtype=int)) self.reviews.append(data_review) np.add.at(self.backgroundwords, data_review, 1.0) self.backgroundwords /= np.sum(self.backgroundwords)
x_char = [] for sent in x_tmp2: padded_sent = sent pad = padsize - len(sent) for i in range(pad): padded_sent = np.vstack((zeroes, padded_sent)) x_char.append(padded_sent) print('Padded until %s tokens.' % padsize) return x_char """ Preparing file """ train = DL('ner_3_train.ner') test = DL('ner_3_test.ner') # val = DL('id-ud-dev.pos') # train.add('id-ud-dev.pos') """ Load pre-trained word embedding """ embeddings_index = {} # WE_DIR = raw_input('Enter word embedding file name: ') WE_DIR = 'polyglot.vec' print 'Loading', WE_DIR, '...' f = open(WE_DIR, 'r') for line in f: values = line.split()
def activationPrompt(name): print "List of Activation Functions\n" \ "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \ "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear" choice = input('Enter type of activation function for ' + name + ': ') activations = ['softmax', 'elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'] return activations[choice - 1] """ Preparing file """ train_name = sys.argv[1] train = DL(train_name) # percentage = input('Enter percentage of data to take: ') percentage = 0.9 seed = sys.argv[2] # seed = input('Enter seed for slicing data: ') train.slice(percentage, seed) test_name = sys.argv[1] test = DL(test_name) test.antislice(percentage, seed) """ Load pre-trained word embedding """ embeddings_index = {} # WE_DIR = raw_input('Enter word embedding file name: ')
x_char = [] for sent in x_tmp2: padded_sent = sent pad = padsize - len(sent) for i in range(pad): padded_sent = np.vstack((zeroes, padded_sent)) x_char.append(padded_sent) print('Padded until %s tokens.' % padsize) return x_char """ Preparing file """ train = DL('ner_3_train.ner') percentage = float(sys.argv[1]) # input('Enter percentage of data to take: ') seed = int(sys.argv[2]) # input('Enter seed for slicing data: ') train.slice(percentage, seed) test = DL('ner_3_test.ner') # val = DL('ner_3_train.ner') # train.add('id-ud-dev.pos') """ Load pre-trained word embedding """ embeddings_index = {} WE_DIR = 'polyglot.vec' print 'Loading', WE_DIR, '...' f = open(WE_DIR, 'r')
def createModel(self, traindata, valdata, testdata, wordemb, charemb): self.train = DL(traindata) self.val = DL(valdata) self.test = DL(testdata) # Load pre-trained embedding embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb) char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader( charemb) # Create Word & Label Index self.char = DI(self.train.words + ce_words) self.word = DI([self.train.words, [we_words]]) self.label = DI([self.train.labels]) print 'Found', self.word.cnt - 1, 'unique words.' print 'Found', self.char.cnt - 1, 'unique chars.' print 'Found', self.label.cnt - 1, 'unique labels.' # Create word embedding matrix self.EMBEDDING_DIM = len(self.coefs) embedding_matrix = np.zeros( (len(self.word.index) + 1, int(self.EMBEDDING_DIM))) for wrd, i in self.word.index.items(): embedding_vector = embeddings_index.get(wrd) if embedding_vector is not None: embedding_matrix[i] = embedding_vector # Create char embedding matrix char_embedding_matrix = np.zeros( (len(self.char.index) + 1, int(self.EMBEDDING_DIM))) for chars, i in self.char.index.items(): embedding_vector = char_embeddings_index.get(chars) if embedding_vector is not None: char_embedding_matrix[i] = embedding_vector trimlen = self.padsize self.train.trim(trimlen) self.test.trim(trimlen) self.val.trim(trimlen) self.x_train = DM(self.train.words, self.word.index) self.x_test = DM(self.test.words, self.word.index) self.x_val = DM(self.val.words, self.word.index) print "Number of OOV:", len(self.x_test.oov_index) print "OOV word occurences:", self.x_test.oov print "Number of OOV (val):", len(self.x_val.oov_index) print "OOV word occurences (val):", self.x_val.oov padsize = self.padsize self.x_train.pad(padsize) self.x_test.pad(padsize) self.x_val.pad(padsize) print('Padded until %s tokens.' % padsize) self.y_train = DM(self.train.labels, self.label.index) self.y_test = DM(self.test.labels, self.label.index) self.y_val = DM(self.val.labels, self.label.index) self.y_train.pad(padsize) self.y_test.pad(padsize) self.y_val.pad(padsize) self.y_encoded = to_categorical(self.y_train.padded) self.y_val_enc = to_categorical(self.y_val.padded) # Converting char text data to int using index self.x_test_char = self.convertCharText2Int(self.test) self.x_train_char = self.convertCharText2Int(self.train) self.x_val_char = self.convertCharText2Int(self.val) # Create keras word model MAX_SEQUENCE_LENGTH = self.padsize embedding_layer = Embedding(len(self.word.index) + 1, self.EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, mask_zero=self.mask) sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32') embedded_sequences = embedding_layer(sequence_input) drop = self.dropout_embedding dropout = Dropout(rate=drop)(embedded_sequences) # Create keras char model def reshape_one(c): return K.reshape(c, (tf.shape(c)[0] * self.padsize, self.char_padsize, self.CHAR_EMBEDDING_DIM)) def reshape_two(c): if merge_m_c == 'concat': return K.reshape(c, (tf.shape(c)[0] / self.padsize, self.padsize, self.CHAR_EMBEDDING_DIM * 2)) else: return K.reshape(c, (tf.shape(c)[0] / self.padsize, self.padsize, self.CHAR_EMBEDDING_DIM)) MAX_WORD_LENGTH = self.char_padsize embedding_layer_c = Embedding(len(self.char.index) + 1, self.EMBEDDING_DIM, weights=[char_embedding_matrix], input_length=MAX_WORD_LENGTH, mask_zero=self.mask) sequence_input_c = Input(shape=( self.padsize, MAX_WORD_LENGTH, ), dtype='int32') embedded_sequences_c = embedding_layer_c(sequence_input_c) dropout_c = Dropout(rate=drop)(embedded_sequences_c) rone = Lambda(reshape_one)(dropout_c) merge_m = 'concat' merge_m_c = merge_m dropout_gru = self.dropout_gru rec_dropout = dropout_gru gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM, return_sequences=False, dropout=dropout_gru, recurrent_dropout=rec_dropout), merge_mode=merge_m, weights=None)(rone) rtwo = Lambda(reshape_two)(gru_karakter) # Combine word + char model merge_m = 'concat' merge = Concatenate()([dropout, rtwo]) gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3, return_sequences=True, dropout=dropout_gru, recurrent_dropout=rec_dropout), merge_mode=merge_m, weights=None)(merge) crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata) self.model = Model(inputs=[sequence_input, sequence_input_c], outputs=[crf]) optimizer = self.optimizer loss = self.loss self.model.summary() self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
class SeqTagger: mask = True # mask pad (zeros) or not EMBEDDING_DIM = 64 CHAR_EMBEDDING_DIM = 64 padsize = 188 char_padsize = 41 dropout_embedding = 0.4 dropout_gru = 0.5 optimizer = 'adagrad' loss = 'poisson' patience = 3 def __init__(self): self.coefs = [] self.textinput = '' self.train = '' self.val = '' self.test = '' self.char = '' self.word = '' self.label = '' self.x_train = '' self.x_val = '' self.x_test = '' self.x_test_char = '' self.y_train = '' self.y_test = '' self.y_val = '' self.y_encoded = '' self.y_val_enc = '' self.x_test_char = '' self.x_train_char = '' self.x_val_char = '' self.model = {} self.results = [] self.data = {} self.json_data = {} def createModel(self, traindata, valdata, testdata, wordemb, charemb): self.train = DL(traindata) self.val = DL(valdata) self.test = DL(testdata) # Load pre-trained embedding embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb) char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader( charemb) # Create Word & Label Index self.char = DI(self.train.words + ce_words) self.word = DI([self.train.words, [we_words]]) self.label = DI([self.train.labels]) print 'Found', self.word.cnt - 1, 'unique words.' print 'Found', self.char.cnt - 1, 'unique chars.' print 'Found', self.label.cnt - 1, 'unique labels.' # Create word embedding matrix self.EMBEDDING_DIM = len(self.coefs) embedding_matrix = np.zeros( (len(self.word.index) + 1, int(self.EMBEDDING_DIM))) for wrd, i in self.word.index.items(): embedding_vector = embeddings_index.get(wrd) if embedding_vector is not None: embedding_matrix[i] = embedding_vector # Create char embedding matrix char_embedding_matrix = np.zeros( (len(self.char.index) + 1, int(self.EMBEDDING_DIM))) for chars, i in self.char.index.items(): embedding_vector = char_embeddings_index.get(chars) if embedding_vector is not None: char_embedding_matrix[i] = embedding_vector trimlen = self.padsize self.train.trim(trimlen) self.test.trim(trimlen) self.val.trim(trimlen) self.x_train = DM(self.train.words, self.word.index) self.x_test = DM(self.test.words, self.word.index) self.x_val = DM(self.val.words, self.word.index) print "Number of OOV:", len(self.x_test.oov_index) print "OOV word occurences:", self.x_test.oov print "Number of OOV (val):", len(self.x_val.oov_index) print "OOV word occurences (val):", self.x_val.oov padsize = self.padsize self.x_train.pad(padsize) self.x_test.pad(padsize) self.x_val.pad(padsize) print('Padded until %s tokens.' % padsize) self.y_train = DM(self.train.labels, self.label.index) self.y_test = DM(self.test.labels, self.label.index) self.y_val = DM(self.val.labels, self.label.index) self.y_train.pad(padsize) self.y_test.pad(padsize) self.y_val.pad(padsize) self.y_encoded = to_categorical(self.y_train.padded) self.y_val_enc = to_categorical(self.y_val.padded) # Converting char text data to int using index self.x_test_char = self.convertCharText2Int(self.test) self.x_train_char = self.convertCharText2Int(self.train) self.x_val_char = self.convertCharText2Int(self.val) # Create keras word model MAX_SEQUENCE_LENGTH = self.padsize embedding_layer = Embedding(len(self.word.index) + 1, self.EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, mask_zero=self.mask) sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32') embedded_sequences = embedding_layer(sequence_input) drop = self.dropout_embedding dropout = Dropout(rate=drop)(embedded_sequences) # Create keras char model def reshape_one(c): return K.reshape(c, (tf.shape(c)[0] * self.padsize, self.char_padsize, self.CHAR_EMBEDDING_DIM)) def reshape_two(c): if merge_m_c == 'concat': return K.reshape(c, (tf.shape(c)[0] / self.padsize, self.padsize, self.CHAR_EMBEDDING_DIM * 2)) else: return K.reshape(c, (tf.shape(c)[0] / self.padsize, self.padsize, self.CHAR_EMBEDDING_DIM)) MAX_WORD_LENGTH = self.char_padsize embedding_layer_c = Embedding(len(self.char.index) + 1, self.EMBEDDING_DIM, weights=[char_embedding_matrix], input_length=MAX_WORD_LENGTH, mask_zero=self.mask) sequence_input_c = Input(shape=( self.padsize, MAX_WORD_LENGTH, ), dtype='int32') embedded_sequences_c = embedding_layer_c(sequence_input_c) dropout_c = Dropout(rate=drop)(embedded_sequences_c) rone = Lambda(reshape_one)(dropout_c) merge_m = 'concat' merge_m_c = merge_m dropout_gru = self.dropout_gru rec_dropout = dropout_gru gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM, return_sequences=False, dropout=dropout_gru, recurrent_dropout=rec_dropout), merge_mode=merge_m, weights=None)(rone) rtwo = Lambda(reshape_two)(gru_karakter) # Combine word + char model merge_m = 'concat' merge = Concatenate()([dropout, rtwo]) gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3, return_sequences=True, dropout=dropout_gru, recurrent_dropout=rec_dropout), merge_mode=merge_m, weights=None)(merge) crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata) self.model = Model(inputs=[sequence_input, sequence_input_c], outputs=[crf]) optimizer = self.optimizer loss = self.loss self.model.summary() self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc']) def trainFit(self): val_data = ([np.array(self.x_val.padded), np.array(self.x_val_char)], [np.array(self.y_val_enc)]) callback = EarlyStopping(monitor='val_loss', patience=self.patience, verbose=0, mode='auto') self.model.fit( [np.array(self.x_train.padded), np.array(self.x_train_char)], [np.array(self.y_encoded)], validation_data=val_data, validation_split=0.1, epochs=100, batch_size=32, callbacks=[callback]) def evaluate(self): self.results = [] print "Computing..." raw_results = self.model.predict( [np.array(self.x_test.padded), np.array(self.x_test_char)]) for raw_result in raw_results: result = [] for token in raw_result: value = np.argmax(token) result.append(value) self.results.append(result) label_index = range(1, len(self.label.index) + 1) label_names = [] for key, value in sorted(self.label.index.iteritems(), key=lambda (k, v): (v, k)): label_names.append(key) # flatten list for sklearn evaluation y_true = [item for sublist in self.y_test.padded for item in sublist] y_pred = [item for sublist in self.results for item in sublist] print "Sklearn evaluation:" print classification_report(y_true, y_pred, labels=label_index, target_names=label_names) f1_mac = f1_score(y_true, y_pred, labels=label_index[1:], average='macro') f1_mic = f1_score(y_true, y_pred, labels=label_index[1:], average='micro') f1 = max([f1_mac, f1_mic]) print 'F-1 Score:' print f1 def predict(self, text): self.textinput = text self.test = DP(text) self.x_test = DM(self.test.words, self.word.index) print "Number of OOV:", len(self.x_test.oov_index) print "OOV word occurences:", self.x_test.oov self.x_test.pad(self.padsize) print('Padded until %s tokens.' % self.padsize) self.x_test_char = self.convertCharText2Int(self.test) self.results = [] print "Computing..." print self.x_test.padded print self.x_test_char raw_results = self.model.predict( [np.array(self.x_test.padded), np.array(self.x_test_char)]) for raw_result in raw_results: result = [] for token in raw_result: value = np.argmax(token) result.append(value) self.results.append(result) temp = self.results[0] li = self.label.index keys = li.keys() values = li.values() self.results = [] start = False for token in temp: if token != 0: start = True if start: if token == 0: self.results.append('?') else: self.results.append(keys[values.index(token)]) print self.test.words[0] print self.results self.data = {'words': self.test.words[0], 'labels': self.results} self.json_data = json.dumps(self.data) return self.json_data def log(self): self.textoutput = '' for token in self.results: self.textoutput = self.textoutput + token + ' ' rnow = datetime.now() logcsv = open('log.csv', 'a') writer = csv.writer(logcsv, delimiter=',') writer.writerow([ 'no', str(rnow.date()), str(rnow.time())[:-10], self.w_name, self.word.cnt - 1, self.char.cnt - 1, self.textinput, len(self.x_test.oov_index), self.textoutput ]) logcsv.close() def convertCharText2Int(self, dataload): x_tmp1 = [] for sent in dataload.words: x_map = DM(sent, self.char.index, False) if x_map.padsize > self.char_padsize: self.char_padsize = x_map.padsize x_tmp1.append(x_map) x_tmp2 = [] for sent in x_tmp1: sent.pad(self.char_padsize) x_tmp2.append(sent.padded) print('Padded until %s chars.' % self.char_padsize) zeroes = [] for i in range(self.char_padsize): zeroes.append(0) x_char = [] for sent in x_tmp2: padded_sent = sent pad = self.padsize - len(sent) for i in range(pad): padded_sent = np.vstack((zeroes, padded_sent)) x_char.append(padded_sent) print('Padded until %s tokens.' % self.padsize) return x_char def pretrainedEmbeddingLoader(self, filename): embeddings_index = {} f = open(filename, 'r') for line in f: values = line.split() token = values[0] self.coefs = np.asarray(values[1:], dtype='float32') embeddings_index[token] = self.coefs f.close() embs = [] for token in embeddings_index: embs.append(token) return embeddings_index, embs def saveModel(self, w_name): for i in range(len(self.model.layers)): with open(w_name + '_' + str(i) + '.wgt', 'wb') as fp: pickle.dump(self.model.layers[i].get_weights(), fp) def loadModel(self, w_name): m_layers_len = len(self.model.layers) for i in range(m_layers_len): with open(w_name + '_' + str(i) + ".wgt", "rb") as fp: w = pickle.load(fp) self.model.layers[i].set_weights(w)
print "List of Activation Functions\n" \ "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \ "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear" choice = input('Enter type of activation function for ' + name + ': ') activations = [ 'softmax', 'elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear' ] return activations[choice - 1] """ Preparing file """ train = DL('ner_2_train.ner') percentage = input('Enter percentage of data to take: ') seed = input('Enter seed for slicing data: ') train.slice(percentage, seed) test = DL('ner_2_test.ner') """ Load pre-trained word embedding """ embeddings_index = {} WE_DIR = raw_input('Enter word embedding file name: ') # WE_DIR = 'polyglot.vec' print 'Loading', WE_DIR, '...' f = open(WE_DIR, 'r') for line in f:
def activationPrompt(name): print "List of Activation Functions\n" \ "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \ "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear" choice = input('Enter type of activation function for ' + name + ': ') activations = ['softmax', 'elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'] return activations[choice-1] """ Preparing file """ train = DL('id-ud-train.pos') test = DL('id-ud-test.pos') train.add('id-ud-dev.pos') """ Load pre-trained word embedding """ embeddings_index = {} WE_DIR = raw_input('Enter word embedding file name: ') # WE_DIR = 'polyglot.vec' print 'Loading', WE_DIR, '...' f = open(WE_DIR, 'r') for line in f: values = line.split()
padded_sent = sent pad = padsize - len(sent) for i in range(pad): padded_sent = np.vstack((zeroes, padded_sent)) x_char.append(padded_sent) print('Padded until %s tokens.' % padsize) return x_char """ Preparing file """ percentage = 0.9 seed = sys.argv[1] train = DL('ner_3_train.ner') train.slice(percentage, seed) test = DL('ner_3_train.ner') test.antislice(percentage, seed) val = DL('ner_3_train.ner') val.antislice(percentage, seed) # train.add('id-ud-dev.pos') """ Load pre-trained word embedding """ embeddings_index = {} # WE_DIR = raw_input('Enter word embedding file name: ') WE_DIR = 'polyglot.vec'