class NamedEntityRecognizer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir
        if os.path.exists(os.path.join(resource_dir, 'label2Idx.json')):
            with open(os.path.join(resource_dir, 'label2Idx.json')) as fi:
                self.label2Idx = json.load(fi)
            self.idx2Label = {v: k for k, v in self.label2Idx.items()}

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        with open(os.path.join(model_path, 'model.json')) as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        # load the weights into the reconstructed model
        self.model.load_weights(os.path.join(model_path, 'model.h5'))
        print("Loaded model from disk")

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        # serialize the architecture to JSON and the weights to HDF5
        model_json = self.model.to_json()
        with open(os.path.join(model_path, 'model.json'), 'w') as json_file:
            json_file.write(model_json)
        self.model.save_weights(os.path.join(model_path, 'model.h5'))
        print("Saved model to disk")

    def train(self, trn_data: List[Tuple[List[str], List[str]]],
              dev_data: List[Tuple[List[str], List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])

        # build the label index from all labels seen in the training and development data
        label_set = set()
        for dataset in [trn_ys, dev_ys]:
            for label_seq in dataset:
                for label in label_seq:
                    label_set.add(label)
        self.label2Idx = {}
        for label in label_set:
            self.label2Idx[label] = len(self.label2Idx)
        self.label2Idx['PAD'] = len(self.label2Idx)
        with open(os.path.join(self.resource_dir, 'label2Idx.json'), 'w') as fo:
            json.dump(self.label2Idx, fo)
        self.idx2Label = {v: k for k, v in self.label2Idx.items()}

        Y_train = self.createMatrices(trn_ys, self.label2Idx)
        Y_dev = self.createMatrices(dev_ys, self.label2Idx)
        train_xs = self.padding_training(trn_xs)
        devlop_xs = self.padding_training(dev_xs)
        Y_train_padding = self.padding_training_Y(Y_train)
        Y_dev_padding = self.padding_training_Y(Y_dev)
        train_ys = np.asarray([np_utils.to_categorical(i, num_classes=len(self.label2Idx))
                               for i in Y_train_padding])
        devlop_ys = np.asarray([np_utils.to_categorical(i, num_classes=len(self.label2Idx))
                                for i in Y_dev_padding])

        # a bidirectional LSTM over the padded word-embedding sequences
        max_sentence_length = 125
        embedding_dim = train_xs.shape[2]
        image_input = Input(shape=(max_sentence_length, embedding_dim))
        output = Bidirectional(
            LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(image_input)
        output = TimeDistributed(Dense(len(self.label2Idx), activation='softmax'))(output)
        self.model = Model(inputs=[image_input], outputs=output)
        self.model.compile(loss='categorical_crossentropy', optimizer='nadam')
        self.model.fit(train_xs, train_ys, batch_size=50, epochs=15,
                       validation_data=(devlop_xs, devlop_ys))

    def createMatrices(self, sentences, label2Idx):
        # map every label sequence to its index sequence
        dataset = []
        for sentence in sentences:
            dataset.append([self.label2Idx[label] for label in sentence])
        return dataset

    def padding_training(self, trn_xs, max_sentence_length=125):
        # pad every sentence to max_sentence_length with the embedding of a blank token
        blank_embedding = self.vsm.emb_list(' ')[0]
        train_xs = []
        for line in trn_xs:
            padding = max_sentence_length - len(line)
            for i in range(padding):
                line.append(blank_embedding)
            train_xs.append(line)
        return np.array(train_xs)

    def padding_training_Y(self, Y_train, max_sentence_length=125):
        # pad every label-index sequence with the index of the 'PAD' label (the last index)
        pad_label = len(self.label2Idx) - 1
        Y_train_padding = []
        for line in Y_train:
            padding = max_sentence_length - len(line)
            for i in range(padding):
                line.append(pad_label)
            Y_train_padding.append(line)
        return np.array(Y_train_padding)

    def decode(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> List[str]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        padding_xs = self.padding_training(xs)
        padding_pred = self.model.predict(padding_xs)
        padding_idx = np.argmax(padding_pred, axis=2)
        padding_labels = []
        for sentence in padding_idx:
            # skip predictions of the 'PAD' label
            token_label = [self.idx2Label[l] for l in sentence if l != len(self.label2Idx) - 1]
            padding_labels.append(token_label)
        return padding_labels

    def evaluate(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        preds = self.decode(data)
        labels = [y for y, _ in data]
        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)
        return float(acc.get()[1])
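if __name__ == '__main__':
    # Minimal usage sketch, not part of the component. Assumptions: 'resources/' holds the
    # FastText binary, and a course-provided loader (here called `read_conll_tsv`, NOT defined
    # in this file) returns List[Tuple[List[str], List[str]]] pairs of (label seq, token seq).
    ner = NamedEntityRecognizer('resources')
    trn = read_conll_tsv('resources/conll03.eng.trn.tsv')
    dev = read_conll_tsv('resources/conll03.eng.dev.tsv')
    ner.train(trn, dev)
    ner.save('resources/bilstm-ner')
    ner.load('resources/bilstm-ner')
    print(ner.evaluate(dev))   # chunk-level F1 from ChunkF1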
class NamedEntityRecognizer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir

        trn_data = self.format_data(tsv_reader(resource_dir, 'conll03.eng.trn.tsv'))
        dev_data = self.format_data(tsv_reader(resource_dir, 'conll03.eng.dev.tsv'))
        tst_data = self.format_data(tsv_reader(resource_dir, 'conll03.eng.tst.tsv'))

        # collect the vocabulary and its FastText embeddings
        token_dic = {}
        for sentence in trn_data + dev_data + tst_data:
            for words in sentence:
                token_dic[words[0]] = True
        tokens = list(token_dic.keys())
        tokens_emb = self.vsm.emb_list(tokens)

        trn_sentence = self.get_char_inform(trn_data)
        dev_sentence = self.get_char_inform(dev_data)
        tst_sentence = self.get_char_inform(tst_data)

        # prepare labels and words
        label_set = set()
        words = {}
        for dataset in [trn_sentence, dev_sentence, tst_sentence]:
            for sentence in dataset:
                for token, char, label in sentence:
                    if label != 'XX':
                        label_set.add(label)
                    words[token.lower()] = True

        # label index
        label_idx = {}
        for label in label_set:
            label_idx[label] = len(label_idx)
        self.label_idx = label_idx

        # case index and one-hot case embeddings
        case_idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3,
                    'other': 4, 'mainly_numeric': 5, 'contains_digit': 6, 'PADDING_TOKEN': 7}
        self.case_embeddings = np.identity(len(case_idx), dtype='float32')
        self.case_idx = case_idx

        # word index and word-embedding matrix
        word_idx = {}
        word_embeddings = []
        df = pd.DataFrame([tokens, tokens_emb])
        combine_embeddings = df.T.values.tolist()
        for split in combine_embeddings:
            word = split[0]
            if len(word_idx) == 0:
                # reserve index 0 for padding and index 1 for unknown tokens
                word_idx["PADDING_TOKEN"] = len(word_idx)
                word_embeddings.append(np.zeros(len(split[1])))
                word_idx["UNKNOWN_TOKEN"] = len(word_idx)
                word_embeddings.append(np.random.uniform(-0.25, 0.25, len(split[1])))
            if word.lower() in words:
                word_embeddings.append(np.array([float(num) for num in split[1]]))
                word_idx[word] = len(word_idx)
        self.word_idx = word_idx
        self.word_embeddings = np.array(word_embeddings)

        # character index
        char_idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            char_idx[c] = len(char_idx)
        self.char_idx = char_idx

        # prepare the datasets
        train_set = self.padding(
            self.get_embedded_data(trn_sentence, word_idx, label_idx, case_idx, char_idx))
        dev_set = self.padding(
            self.get_embedded_data(dev_sentence, word_idx, label_idx, case_idx, char_idx))
        test_set = self.padding(
            self.get_embedded_data(tst_sentence, word_idx, label_idx, case_idx, char_idx))
        self.idx2Label = {v: k for k, v in label_idx.items()}

        self.train_batch, self.train_batch_len = self.get_batch(train_set)
        self.dev_batch, self.dev_batch_len = self.get_batch(dev_set)
        self.test_batch, self.test_batch_len = self.get_batch(test_set)

    def get_word_embd(self):
        return self.word_embeddings

    def get_case_emb(self):
        return self.case_embeddings

    def get_char2index(self):
        return self.char_idx

    def prepare_data(self, data):
        data = self.format_data(data)
        sentences = self.get_char_inform(data)
        dataset = self.padding(
            self.get_embedded_data(sentences, self.word_idx, self.label_idx,
                                   self.case_idx, self.char_idx))
        batch, _ = self.get_batch(dataset)
        return batch

    def get_model(self):
        word_embeddings = self.word_embeddings
        case_embeddings = self.case_embeddings
        char_idx = self.char_idx
        label_idx = self.label_idx

        # word input: pre-trained FastText embeddings (frozen)
        words_input = Input(shape=(None,), dtype='int32', name='words_input')
        words = Embedding(input_dim=word_embeddings.shape[0],
                          output_dim=word_embeddings.shape[1],
                          weights=[word_embeddings], trainable=False)(words_input)

        # casing input: one-hot casing features (frozen)
        casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
        casing = Embedding(output_dim=case_embeddings.shape[1],
                           input_dim=case_embeddings.shape[0],
                           weights=[case_embeddings], trainable=False)(casing_input)

        # character input: per-token character CNN
        character_input = Input(shape=(None, 52,), name='char_input')
        embed_char_out = TimeDistributed(
            Embedding(len(char_idx), 30,
                      embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)),
            name='char_embedding')(character_input)
        dropout = Dropout(0.5)(embed_char_out)
        conv1d_out = TimeDistributed(
            Conv1D(kernel_size=3, filters=30, padding='same', activation='tanh', strides=1))(dropout)
        maxpool_out = TimeDistributed(MaxPooling1D(52))(conv1d_out)
        char = TimeDistributed(Flatten())(maxpool_out)
        char = Dropout(0.5)(char)

        # concatenate all features and run a bidirectional LSTM tagger
        output = concatenate([words, casing, char])
        output = Bidirectional(
            LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
        output = TimeDistributed(Dense(len(label_idx), activation='softmax'))(output)

        model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])
        model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
        model.summary()
        return model

    def get_casing(self, word, case_tag):
        casing = 'other'
        num_digit = sum(1 for char in word if char.isdigit())
        digit_frac = num_digit / float(len(word))
        if word.isdigit():
            casing = 'numeric'
        elif digit_frac > 0.5:
            casing = 'mainly_numeric'
        elif word.islower():
            casing = 'allLower'
        elif word.isupper():
            casing = 'allUpper'
        elif word[0].isupper():
            casing = 'initialUpper'
        elif num_digit > 0:
            casing = 'contains_digit'
        return case_tag[casing]

    def get_batch(self, data):
        # group sentences by length so each mini-batch contains equally long sequences
        lengths = {len(i[0]) for i in data}
        batches = []
        batch_len = []
        z = 0
        for i in lengths:
            for batch in data:
                if len(batch[0]) == i:
                    batches.append(batch)
                    z += 1
            batch_len.append(z)
        return batches, batch_len

    def get_embedded_data(self, sentences, word_idx, label_idx, case_idx, char_idx):
        unknownIdx = word_idx['UNKNOWN_TOKEN']
        dataset = []
        wordCount = 0
        unknownWordCount = 0
        for sentence in sentences:
            wordIndices = []
            caseIndices = []
            charIndices = []
            labelIndices = []
            for word, char, label in sentence:
                wordCount += 1
                if word in word_idx:
                    wordIdx = word_idx[word]
                elif word.lower() in word_idx:
                    wordIdx = word_idx[word.lower()]
                else:
                    wordIdx = unknownIdx
                    unknownWordCount += 1
                charIdx = [char_idx[x] for x in char]
                # map the word, casing, characters, and label to their indices
                wordIndices.append(wordIdx)
                caseIndices.append(self.get_casing(word, case_idx))
                charIndices.append(charIdx)
                if label != 'XX':
                    labelIndices.append(label_idx[label])
            dataset.append([wordIndices, caseIndices, charIndices, labelIndices])
        return dataset

    def get_mini_batch(self, dataset, batch_len):
        start = 0
        for i in batch_len:
            tokens = []
            caseing = []
            char = []
            labels = []
            for t, c, ch, l in dataset[start:i]:
                l = np.expand_dims(l, -1)
                tokens.append(t)
                caseing.append(c)
                char.append(ch)
                labels.append(l)
            start = i
            yield np.asarray(labels), np.asarray(tokens), np.asarray(caseing), np.asarray(char)

    def get_char_inform(self, Sentences):
        # attach the character sequence of each token: [token, chars, label]
        for i, sentence in enumerate(Sentences):
            for j, data in enumerate(sentence):
                chars = [c for c in data[0]]
                Sentences[i][j] = [data[0], chars, data[1]]
        return Sentences

    def padding(self, Sentences):
        # pad each token's character sequence to a fixed length of 52
        for i, sentence in enumerate(Sentences):
            Sentences[i][2] = pad_sequences(Sentences[i][2], 52, padding='post')
        return Sentences

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        self.model = load_model(model_path)
        return self.model

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        self.model.save(model_path)

    def tuple2list(self, data):
        res = []
        for el in data:
            res += el
        return res

    def format_data(self, data):
        temp1 = []
        for ele in data:
            token = ele[0]
            label = ele[1]
            temp2 = []
            for i in range(len(token)):
                temp2.append([label[i], token[i]])
            temp1.append(temp2)
        return temp1

    def train(self, trn_data: List[Tuple[List[str], List[str]]],
              dev_data: List[Tuple[List[str], List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        self.model = self.get_model()
        epochs = 80
        for epoch in range(epochs):
            print("Epoch %d/%d" % (epoch, epochs))
            a = Progbar(len(self.train_batch_len))
            for i, batch in enumerate(self.get_mini_batch(self.train_batch, self.train_batch_len)):
                labels, tokens, casing, char = batch
                self.model.train_on_batch([tokens, casing, char], labels)
                a.update(i)
            a.update(i + 1)
            print(' ')

        # persist the lookup tables needed at decoding time
        save_data = [self.word_embeddings, self.case_embeddings, self.idx2Label, self.word_idx,
                     self.word_idx, self.label_idx, self.case_idx, self.char_idx]
        with open(os.path.join(self.resource_dir, 'pickle'), 'wb') as handle:
            pickle.dump(save_data, handle)

    def dev_evaluate(self, model, data):
        correctLabels = []
        predLabels = []
        b = Progbar(len(data))
        for i, data1 in enumerate(data):
            tokens, casing, char, labels = data1
            tokens = np.asarray([tokens])
            casing = np.asarray([casing])
            char = np.asarray([char])
            pred = model.predict([tokens, casing, char], verbose=False)[0]
            pred = pred.argmax(axis=-1)  # predict the classes
            correctLabels.append(labels)
            predLabels.append(pred)
            b.update(i)
        b.update(i + 1)
        label_pred = [[self.idx2Label[element] for element in sentence] for sentence in predLabels]
        label_correct = [[self.idx2Label[element] for element in sentence] for sentence in correctLabels]
        acc = ChunkF1()
        for pred, label in zip(label_pred, label_correct):
            acc.update(pred, label)
        print(float(acc.get()[1]))

    def decode(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> List[List[str]]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels and the corresponding gold labels.
        """
        with open(os.path.join(self.resource_dir, 'pickle'), 'rb') as handle:
            save_data = pickle.load(handle)
        (self.word_embeddings, self.case_embeddings, self.idx2Label, self.word_idx,
         self.word_idx, self.label_idx, self.case_idx, self.char_idx) = save_data
        dataset = self.prepare_data(data)
        model = self.load(os.path.join(self.resource_dir, 'hw3-model'))

        correctLabels = []
        predLabels = []
        b = Progbar(len(dataset))
        for i, data1 in enumerate(dataset):
            tokens, casing, char, labels = data1
            tokens = np.asarray([tokens])
            casing = np.asarray([casing])
            char = np.asarray([char])
            pred = model.predict([tokens, casing, char], verbose=False)[0]
            pred = pred.argmax(axis=-1)  # predict the classes
            correctLabels.append(labels)
            predLabels.append(pred)
            b.update(i)
        b.update(i + 1)

        label_pred = [[self.idx2Label[element] for element in sentence] for sentence in predLabels]
        label_correct = [[self.idx2Label[element] for element in sentence] for sentence in correctLabels]
        return label_pred, label_correct

    def evaluate(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        preds, labels = self.decode(data)
        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)
        return float(acc.get()[1])
class SentimentAnalyzer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.net = Net()
        # note: the model and plots are written to the RESOURCE environment variable
        self.resource_dir = os.environ.get('RESOURCE')

    def pad_x(self, trn_xs):
        # pad (or truncate) every sentence to a fixed length of 61 tokens with zero vectors
        trn_xs = list(trn_xs)
        max_len = 61
        for i in range(len(trn_xs)):
            if len(trn_xs[i]) <= max_len:
                temp = [np.zeros(50) for _ in range(max_len - len(trn_xs[i]))]
                trn_xs[i] = trn_xs[i] + temp
            else:
                trn_xs[i] = trn_xs[i][0:max_len]
        return tuple(trn_xs)

    def load1(self, model_path: str, **kwargs):
        """
        Load the pre-trained model (entire serialized module).
        :param model_path:
        :param kwargs:
        """
        model_name = kwargs['name']
        return torch.load(os.path.join(model_path, model_name))

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        model = Net()
        model.load_state_dict(torch.load(model_path))
        return model

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        torch.save(self.net.state_dict(), model_path)

    def save1(self, model_path: str, **kwargs):
        model = kwargs['model']
        model_name = kwargs['name']
        torch.save(model, os.path.join(model_path, model_name))

    def get_tensor_data(self, dev_xs):
        # convert a tuple of [seq_len x 50] embedding lists into a [batch, 1, seq_len, 50] tensor
        result = np.array([np.array(i) for i in dev_xs])
        dev_xs = torch.FloatTensor(result)
        return dev_xs.unsqueeze(1)

    def plot_fig(self, x1, x2, epoch):
        fig, ax = plt.subplots()
        ax.plot(x1, label='training')
        ax.plot(x2, label='validation')
        ax.set(xlabel='epoch', ylabel='accuracy', title='model accuracy')
        ax.grid()
        plt.legend()
        name = 'model' + str(epoch) + '.png'
        fig.savefig(os.path.join(self.resource_dir, name))

    def train(self, trn_data: List[Tuple[int, List[str]]],
              dev_data: List[Tuple[int, List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        trn_xs = self.pad_x(trn_xs)
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])
        dev_xs = self.pad_x(dev_xs)

        train_data = MyDataset(trn_xs, trn_ys)
        dev_xs = self.get_tensor_data(dev_xs)
        dev_ys = list(dev_ys)
        train_loader = Data.DataLoader(dataset=train_data, batch_size=64, shuffle=True)

        net = self.net
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9)

        total_epoch = 22
        train_acc = []
        vali_acc = []
        for epoch in range(total_epoch):
            for i, data in enumerate(train_loader):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            # report the accuracy on the last mini-batch and on the development set
            vali_output = net(dev_xs)
            vali_pred_y = torch.max(vali_output, 1)[1].data.numpy()
            vali_accuracy = float((vali_pred_y == dev_ys).astype(int).sum()) / float(len(dev_ys))
            train_output = net(inputs)
            train_pred_y = torch.max(train_output, 1)[1].data.numpy()
            train_accuracy = float((train_pred_y == labels.tolist()).astype(int).sum()) / float(len(labels))
            print('epoch:', epoch, 'of', total_epoch,
                  '| train accuracy: %.4f' % train_accuracy,
                  '| validation accuracy: %.4f' % vali_accuracy)
            train_acc.append(train_accuracy)
            vali_acc.append(vali_accuracy)

        self.plot_fig(train_acc, vali_acc, epoch)
        self.net = net
        self.save(os.path.join(self.resource_dir, 'hw2-model'))

    def decode(self, data: List[Tuple[int, List[str]]], **kwargs) -> List[int]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        xs = self.pad_x(xs)
        inputs = self.get_tensor_data(xs)
        model = self.load(os.path.join(self.resource_dir, 'hw2-model'))
        outputs = model(inputs)
        pred_y = torch.max(outputs, 1)[1].data.numpy()
        return pred_y

    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        return 100.0 * correct / total
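# `Net` and `MyDataset` are used by the class above but are not defined in this file.
# Below is a minimal sketch of what they could look like, inferred only from how they are
# called: inputs of shape [batch, 1, 61, 50], CrossEntropyLoss on the raw outputs, and a
# DataLoader built from padded embeddings plus integer labels. The 5-way output and the
# layer sizes are assumptions for illustration, not the original implementation.
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as Data


class Net(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        # one 2-D convolution over the [61 x 50] "image" of word embeddings
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(3, 50)),   # -> [batch, 16, 59, 1]
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(59, 1)),       # -> [batch, 16, 1, 1]
        )
        self.fc = nn.Linear(16, num_classes)

    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)                            # raw logits for CrossEntropyLoss


class MyDataset(Data.Dataset):
    def __init__(self, xs, ys):
        # xs: tuple of padded [61 x 50] embedding lists, ys: tuple of int labels
        self.xs = [torch.FloatTensor(np.array(x)).unsqueeze(0) for x in xs]  # [1, 61, 50]
        self.ys = list(ys)

    def __len__(self):
        return len(self.ys)

    def __getitem__(self, idx):
        return self.xs[idx], self.ys[idx]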
class SentimentAnalyzer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # TODO: to be filled.

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def train(self, trn_data: List[Tuple[int, List[str]]],
              dev_data: List[Tuple[int, List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])
        # TODO: to be filled
        pass

    def decode(self, data: List[Tuple[int, List[str]]], **kwargs) -> List[int]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        # TODO: to be filled

    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        return 100.0 * correct / total
class SentimentAnalyzer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        with open(os.path.join(model_path, 'model.json')) as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        # load the weights into the reconstructed model
        self.model.load_weights(os.path.join(model_path, 'model.h5'))
        print("Loaded model from disk")

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # make sure the directory exists, then serialize the architecture to JSON
        # and the weights to HDF5
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_json = self.model.to_json()
        with open(os.path.join(model_path, 'model.json'), 'w') as json_file:
            json_file.write(model_json)
        self.model.save_weights(os.path.join(model_path, 'model.h5'))
        print("Saved model to disk")

    def padding_training(self, trn_xs, max_sentence_length=80):
        # pad every sentence to max_sentence_length with the embedding of a blank token,
        # then add a channel dimension so the input can be fed to Conv2D
        blank_embedding = self.vsm.emb_list(' ')[0]
        train_xs = []
        for line in trn_xs:
            padding = max_sentence_length - len(line)
            for i in range(padding):
                line.append(blank_embedding)
            train_xs.append(line)
        train_xs = np.array(train_xs)
        return train_xs.reshape(train_xs.shape[0], train_xs.shape[1], train_xs.shape[2], 1)

    def train(self, trn_data: List[Tuple[int, List[str]]],
              dev_data: List[Tuple[int, List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])

        # generate one-hot label vectors
        number_of_classes = 5
        Y_train = np_utils.to_categorical(trn_ys, number_of_classes)
        Y_dev = np_utils.to_categorical(dev_ys, number_of_classes)

        # pad the sentences and generate the training/development matrices
        train_xs = self.padding_training(trn_xs)
        devlop_xs = self.padding_training(dev_xs)

        # define the model: three parallel convolution branches with kernel heights 3, 4, and 5
        first_ksize = 3
        second_ksize = 4
        third_ksize = 5
        max_sentence_length = 80
        embedding_dim = train_xs.shape[2]
        reg = l2(0.15)  # instantiate the regularizer

        image_input = Input(shape=(max_sentence_length, embedding_dim, 1))

        first_kernel = Conv2D(64, (first_ksize, embedding_dim), strides=(1, 1),
                              padding='valid', activation='relu')(image_input)
        first_kernel = MaxPooling2D(pool_size=(max_sentence_length - first_ksize + 1, 1),
                                    strides=(1, 1), padding='valid')(first_kernel)
        first_kernel = Flatten()(first_kernel)

        second_kernel = Conv2D(64, (second_ksize, embedding_dim), strides=(1, 1),
                               padding='valid', activation='relu')(image_input)
        second_kernel = MaxPooling2D(pool_size=(max_sentence_length - second_ksize + 1, 1),
                                     strides=(1, 1), padding='valid')(second_kernel)
        second_kernel = Flatten()(second_kernel)

        third_kernel = Conv2D(64, (third_ksize, embedding_dim), strides=(1, 1),
                              padding='valid', activation='relu')(image_input)
        third_kernel = MaxPooling2D(pool_size=(max_sentence_length - third_ksize + 1, 1),
                                    strides=(1, 1), padding='valid')(third_kernel)
        third_kernel = Flatten()(third_kernel)

        merged = concatenate([first_kernel, second_kernel, third_kernel])
        merged = Dropout(0.5)(merged)
        output = Dense(5, activation='softmax', activity_regularizer=reg)(merged)
        self.model = Model(inputs=[image_input], outputs=output)

        # compile the model
        self.model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

        # batch the input with generators and fit the model
        gen = ImageDataGenerator()
        test_gen = ImageDataGenerator()
        train_generator = gen.flow(train_xs, Y_train, batch_size=50)
        test_generator = test_gen.flow(devlop_xs, Y_dev, batch_size=50)
        self.model.fit_generator(train_generator, steps_per_epoch=train_xs.shape[0] // 50,
                                 epochs=15, validation_data=test_generator,
                                 validation_steps=devlop_xs.shape[0] // 50)

    def decode(self, data: List[Tuple[int, List[str]]], **kwargs) -> List[int]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        padding_xs = self.padding_training(xs)
        pred = self.model.predict(padding_xs)
        return pred.argmax(axis=-1)

    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        print("accuracy")
        print(100.0 * correct / total)
        return 100.0 * correct / total
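if __name__ == '__main__':
    # Minimal usage sketch, not part of the component. Assumptions: 'resources/' holds the
    # FastText binary, and a course-provided loader (here called `read_sst_tsv`, NOT defined
    # in this file) returns List[Tuple[int, List[str]]] for the SST splits. Batches of 50 are
    # drawn from the generators, so each split should contain at least 50 sentences.
    sa = SentimentAnalyzer('resources')
    trn = read_sst_tsv('resources/sst.trn.tsv')
    dev = read_sst_tsv('resources/sst.dev.tsv')
    sa.train(trn, dev)                     # pads to 80 tokens and trains the 3/4/5-gram CNN
    sa.save('resources/hw2-keras-model')
    sa.load('resources/hw2-keras-model')
    print(sa.evaluate(dev))                # prints and returns accuracy in percent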
class NamedEntityRecognizer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # TODO: to be filled.

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def train(self, trn_data: List[Tuple[List[str], List[str]]],
              dev_data: List[Tuple[List[str], List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])
        # TODO: to be filled
        pass

    def decode(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> List[List[str]]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        # TODO: to be filled

    def evaluate(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        preds = self.decode(data)
        labels = [y for y, _ in data]
        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)
        return float(acc.get()[1])