                token_label.append(self.idx2Label[l])
            padding_labels.append(token_label)
        # y_classes = pred.argmax(axis=-1)
        return padding_labels

    def evaluate(self, data: List[Tuple[List[str], List[str]]], **kwargs) -> float:
        """
        :param data: a list of (gold labels, tokens) tuples.
        :param kwargs: any additional keyword arguments.
        :return: the chunk-level F1 score of this model on the data.
        """
        preds = self.decode(data)
        labels = [y for y, _ in data]
        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)
        return float(acc.get()[1])


if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    named_entity_recognizer = NamedEntityRecognizer(resource_dir)
    trn_data = tsv_reader(resource_dir, 'conll03.eng.trn.tsv')
    dev_data = tsv_reader(resource_dir, 'conll03.eng.dev.tsv')
    tst_data = tsv_reader(resource_dir, 'conll03.eng.tst.tsv')
    named_entity_recognizer.train(trn_data, dev_data)
    named_entity_recognizer.evaluate(tst_data)
    named_entity_recognizer.save(os.path.join(resource_dir, 'hw3-model'))
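# For reference: ChunkF1 scores whole entity spans rather than individual tokens, so
# the value returned by evaluate() above is a chunk-level F1, not token accuracy.
# The sketch below is a minimal, self-contained illustration of chunk-level F1 over
# BIO-tagged label sequences; bio_chunks and chunk_f1 are hypothetical helpers and
# not the project's actual ChunkF1 implementation.
from typing import List, Set, Tuple


def bio_chunks(labels: List[str]) -> Set[Tuple[int, int, str]]:
    """Extract (start, end, type) spans from a BIO-tagged sequence."""
    chunks, start, ctype = set(), None, None
    for i, label in enumerate(labels + ['O']):  # sentinel closes the final span
        if label.startswith('B-') or label == 'O' or (label.startswith('I-') and label[2:] != ctype):
            if start is not None:
                chunks.add((start, i, ctype))
                start, ctype = None, None
        if label.startswith('B-'):
            start, ctype = i, label[2:]
        elif label.startswith('I-') and start is None:  # ill-formed I- treated as B-
            start, ctype = i, label[2:]
    return chunks


def chunk_f1(pred: List[str], gold: List[str]) -> float:
    """F1 where a prediction counts only if the whole span and its type match."""
    p, g = bio_chunks(pred), bio_chunks(gold)
    if not p or not g:
        return 0.0
    tp = len(p & g)
    precision, recall = tp / len(p), tp / len(g)
    return 2 * precision * recall / (precision + recall) if tp else 0.0


# Example: the PER span matches exactly; the truncated ORG span does not count.
print(chunk_f1(['B-PER', 'I-PER', 'O', 'B-ORG', 'O'],
               ['B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG']))  # 0.5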
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        :param embedding_file: the name of the word embedding file under resource_dir.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir
        trn_data = self.format_data(tsv_reader(resource_dir, 'conll03.eng.trn.tsv'))
        dev_data = self.format_data(tsv_reader(resource_dir, 'conll03.eng.dev.tsv'))
        tst_data = self.format_data(tsv_reader(resource_dir, 'conll03.eng.tst.tsv'))

        ## collect the token vocabulary across all splits and embed it
        token_dic = {}
        for sentence in trn_data + dev_data + tst_data:
            for word in sentence:
                token_dic[word[0]] = True
        tokens = list(token_dic.keys())
        tokens_emb = self.vsm.emb_list(tokens)

        trn_sentence = self.get_char_inform(trn_data)
        dev_sentence = self.get_char_inform(dev_data)
        tst_sentence = self.get_char_inform(tst_data)

        ## prepare the label set and the lowercased word vocabulary
        label_set = set()
        words = {}
        for dataset in [trn_sentence, dev_sentence, tst_sentence]:
            for sentence in dataset:
                for token, char, label in sentence:
                    if label != 'XX':
                        label_set.add(label)
                    words[token.lower()] = True

        ## label index
        label_idx = {}
        for label in label_set:
            label_idx[label] = len(label_idx)
        self.label_idx = label_idx

        ## case index and one-hot case embeddings
        case_idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3,
                    'other': 4, 'mainly_numeric': 5, 'contains_digit': 6, 'PADDING_TOKEN': 7}
        self.case_embeddings = np.identity(len(case_idx), dtype='float32')
        self.case_idx = case_idx

        ## word to index and word embeddings: index 0 is a zero vector for padding,
        ## index 1 is a random vector for unknown tokens
        word_idx = {}
        word_embeddings = []
        df = pd.DataFrame([tokens, tokens_emb])
        combine_embeddings = df.T.values.tolist()
        for split in combine_embeddings:
            word = split[0]
            if len(word_idx) == 0:
                word_idx['PADDING_TOKEN'] = len(word_idx)
                word_embeddings.append(np.zeros(len(split[1])))
                word_idx['UNKNOWN_TOKEN'] = len(word_idx)
                word_embeddings.append(np.random.uniform(-0.25, 0.25, len(split[1])))
            if word.lower() in words:
                word_embeddings.append(np.array([float(num) for num in split[1]]))
                word_idx[word] = len(word_idx)
        self.word_idx = word_idx
        self.word_embeddings = np.array(word_embeddings)

        ## char index
        char_idx = {'PADDING': 0, 'UNKNOWN': 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            char_idx[c] = len(char_idx)
        self.char_idx = char_idx

        ## prepare the padded, index-encoded datasets and their batches
        train_set = self.padding(self.get_embedded_data(trn_sentence, word_idx, label_idx, case_idx, char_idx))
        dev_set = self.padding(self.get_embedded_data(dev_sentence, word_idx, label_idx, case_idx, char_idx))
        test_set = self.padding(self.get_embedded_data(tst_sentence, word_idx, label_idx, case_idx, char_idx))
        self.idx2Label = {v: k for k, v in label_idx.items()}
        self.train_batch, self.train_batch_len = self.get_batch(train_set)
        self.dev_batch, self.dev_batch_len = self.get_batch(dev_set)
        self.test_batch, self.test_batch_len = self.get_batch(test_set)
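# The case_idx table above implies a helper that maps each token to one of its casing
# categories before lookup in self.case_embeddings. That helper is not shown in this
# fragment; get_casing below is a hypothetical sketch of how such a mapping is commonly
# written, assuming the category names defined in case_idx.
def get_casing(word: str, case_idx: dict) -> int:
    """Map a token to a casing category index (hypothetical sketch)."""
    num_digits = sum(c.isdigit() for c in word)
    digit_fraction = num_digits / float(len(word)) if word else 0.0

    if word.isdigit():
        casing = 'numeric'
    elif digit_fraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():
        casing = 'allLower'
    elif word.isupper():
        casing = 'allUpper'
    elif word and word[0].isupper():
        casing = 'initialUpper'
    elif num_digits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'
    return case_idx[casing]

# Because self.case_embeddings is an identity matrix, the returned index effectively
# selects a one-hot casing feature for the token.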
import os
from time import time

from src.hw2 import SentimentAnalyzer
from src.util import tsv_reader

if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    tst_data = tsv_reader(resource_dir, 'sst.tst.tsv')

    start = time()
    sentiment_analyzer = SentimentAnalyzer(resource_dir)
    sentiment_analyzer.load(os.path.join(resource_dir, 'hw2-model'))
    score = sentiment_analyzer.evaluate(tst_data)
    end = time()

    print(score, end - start)
    # TODO: to be filled
    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data: a list of (gold label, tokens) tuples.
        :param kwargs: any additional keyword arguments.
        :return: the accuracy of this model on the data, as a percentage.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        return 100.0 * correct / total


if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    # resource_dir = '../res/'
    sentiment_analyzer = SentimentAnalyzer(resource_dir)
    trn_data = tsv_reader(resource_dir, 'sst.trn.tsv')
    dev_data = tsv_reader(resource_dir, 'sst.dev.tsv')
    tst_data = tsv_reader(resource_dir, 'sst.tst.tsv')
    sentiment_analyzer.train(trn_data, dev_data)
    sentiment_analyzer.evaluate(tst_data)
    sentiment_analyzer.save(os.path.join(resource_dir, 'hw2-model'))
    sentiment_analyzer.load(os.path.join(resource_dir, 'hw2-model'))
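# Quick sanity check of the accuracy loop in evaluate() above, replayed on toy labels.
# Illustrative only: the values below are made up and not project data, and decode()
# is bypassed entirely.
gold_labels = [1, 0, 1, 1]
auto_labels = [1, 0, 0, 1]

total = correct = 0
for gold, auto in zip(gold_labels, auto_labels):
    if gold == auto:
        correct += 1
    total += 1

print(100.0 * correct / total)  # 75.0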
import os
from time import time

from src.hw3 import NamedEntityRecognizer
from src.util import tsv_reader

if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    tst_data = tsv_reader(resource_dir, 'conll03.eng.tst.tsv')

    start = time()
    named_entity_recognizer = NamedEntityRecognizer(resource_dir)
    named_entity_recognizer.load(os.path.join(resource_dir, 'hw3-model'))
    score = named_entity_recognizer.evaluate(tst_data)
    end = time()

    print(score, end - start)