Example 1
                    token_label.append(self.idx2Label[l])
            padding_labels.append(token_label)

        # y_classes = pred.argmax(axis=-1)
        return padding_labels

    def evaluate(self, data: List[Tuple[List[str], List[str]]],
                 **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        preds = self.decode(data)
        labels = [y for y, _ in data]
        f1 = ChunkF1()
        for pred, label in zip(preds, labels):
            f1.update(pred, label)
        return float(f1.get()[1])


if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    named_entity_recognizer = NamedEntityRecognizer(resource_dir)
    trn_data = tsv_reader(resource_dir, 'conll03.eng.trn.tsv')
    dev_data = tsv_reader(resource_dir, 'conll03.eng.dev.tsv')
    tst_data = tsv_reader(resource_dir, 'conll03.eng.tst.tsv')
    named_entity_recognizer.train(trn_data, dev_data)
    print(named_entity_recognizer.evaluate(tst_data))
    named_entity_recognizer.save(os.path.join(resource_dir, 'hw3-model'))
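
The ChunkF1 metric itself is not shown here; judging from its use above, it follows an update()/get() interface where get() returns a (name, score) pair. A simplified stand-in, assuming the BIO label scheme of CoNLL-03 and skipping the stricter consistency checks of the official CoNLL evaluator, could be sketched as:

class SimpleChunkF1:
    """Corpus-level F1 over exact (type, start, end) span matches."""

    def __init__(self):
        self.correct = self.predicted = self.gold = 0

    @staticmethod
    def _chunks(labels):
        # collect (type, start, end) spans from a BIO label sequence
        spans, start, ctype = set(), None, None
        for i, label in enumerate(list(labels) + ['O']):
            if start is not None and not label.startswith('I-'):
                spans.add((ctype, start, i))
                start = None
            if label.startswith('B-'):
                start, ctype = i, label[2:]
        return spans

    def update(self, pred, gold):
        p, g = self._chunks(pred), self._chunks(gold)
        self.correct += len(p & g)
        self.predicted += len(p)
        self.gold += len(g)

    def get(self):
        precision = self.correct / self.predicted if self.predicted else 0.0
        recall = self.correct / self.gold if self.gold else 0.0
        f1 = (2 * precision * recall / (precision + recall)
              if precision + recall else 0.0)
        return 'chunk-f1', f1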
Example 2
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir

        trn_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.trn.tsv'))
        dev_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.dev.tsv'))
        tst_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.tst.tsv'))

        # collect the vocabulary across all splits so that every token
        # can be paired with an embedding vector below
        token_dic = {}
        for sentence in trn_data + dev_data + tst_data:
            for word in sentence:
                token = word[0]
                token_dic[token] = True

        tokens = list(token_dic.keys())
        tokens_emb = self.vsm.emb_list(tokens)

        trn_sentence = self.get_char_inform(trn_data)
        dev_sentence = self.get_char_inform(dev_data)
        tst_sentence = self.get_char_inform(tst_data)

        ## prepare labels and words
        label_set = set()
        words = {}
        for dataset in [trn_sentence, dev_sentence, tst_sentence]:
            for sentence in dataset:
                for token, char, label in sentence:
                    if label != 'XX':
                        label_set.add(label)
                        words[token.lower()] = True

        ## label index
        label_idx = {}
        for label in label_set:
            label_idx[label] = len(label_idx)
        self.label_idx = label_idx

        ## case index and case embedding
        case_idx = {
            'numeric': 0,
            'allLower': 1,
            'allUpper': 2,
            'initialUpper': 3,
            'other': 4,
            'mainly_numeric': 5,
            'contains_digit': 6,
            'PADDING_TOKEN': 7
        }
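        # one-hot casing features: each token will be assigned exactly one of
        # these classes (e.g. "NATO" -> 'allUpper', "Berlin" -> 'initialUpper')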
        self.case_embeddings = np.identity(len(case_idx), dtype='float32')
        self.case_idx = case_idx

        ## word to index and word embedding
        word_idx = {}
        word_embeddings = []

        # reserve index 0 for padding (zero vector) and index 1 for
        # unknown tokens (small random vector)
        emb_dim = len(tokens_emb[0])
        word_idx["PADDING_TOKEN"] = len(word_idx)
        word_embeddings.append(np.zeros(emb_dim))
        word_idx["UNKNOWN_TOKEN"] = len(word_idx)
        word_embeddings.append(np.random.uniform(-0.25, 0.25, emb_dim))

        # keep an embedding row only for tokens that occur in the labeled data
        for word, emb in zip(tokens, tokens_emb):
            if word.lower() in words:
                word_embeddings.append(np.array([float(num) for num in emb]))
                word_idx[word] = len(word_idx)

        self.word_idx = word_idx
        self.word_embeddings = np.array(word_embeddings)

        ## char index
        char_idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            char_idx[c] = len(char_idx)
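
        # characters outside this inventory are presumably mapped to
        # "UNKNOWN" (index 1) when tokens are embedded character by character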

        self.char_idx = char_idx

        ## prepare dataset
        train_set = self.padding(
            self.get_embedded_data(trn_sentence, word_idx, label_idx, case_idx,
                                   char_idx))
        dev_set = self.padding(
            self.get_embedded_data(dev_sentence, word_idx, label_idx, case_idx,
                                   char_idx))
        test_set = self.padding(
            self.get_embedded_data(tst_sentence, word_idx, label_idx, case_idx,
                                   char_idx))

        self.idx2Label = {v: k for k, v in label_idx.items()}
        self.train_batch, self.train_batch_len = self.get_batch(train_set)
        self.dev_batch, self.dev_batch_len = self.get_batch(dev_set)
        self.test_batch, self.test_batch_len = self.get_batch(test_set)
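
The helpers referenced above (format_data, get_char_inform, get_embedded_data, padding, get_batch) are not shown, but the casing classes in case_idx imply a token-to-class mapping along these lines (a sketch; the exact thresholds are assumptions, not taken from the original):

def get_casing(word, case_idx):
    """Map a token to the index of its casing class."""
    num_digits = sum(c.isdigit() for c in word)
    if word.isdigit():
        casing = 'numeric'
    elif num_digits / max(len(word), 1) > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():
        casing = 'allLower'
    elif word.isupper():
        casing = 'allUpper'
    elif word[:1].isupper():
        casing = 'initialUpper'
    elif num_digits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'
    return case_idx[casing]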
Example 3
import os
from time import time

from src.hw2 import SentimentAnalyzer
from src.util import tsv_reader

if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    tst_data = tsv_reader(resource_dir, 'sst.tst.tsv')
    start = time()
    sentiment_analyzer = SentimentAnalyzer(resource_dir)
    sentiment_analyzer.load(os.path.join(resource_dir, 'hw2-model'))
    score = sentiment_analyzer.evaluate(tst_data)
    end = time()
    print(score, end - start)
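
These driver scripts read the resource directory from the RESOURCE environment variable and fail later with a less obvious error when it is unset; a small guard (hypothetical, not part of the original scripts) makes that failure mode explicit:

import os

resource_dir = os.environ.get('RESOURCE')
if resource_dir is None:
    raise SystemExit('Set the RESOURCE environment variable to the directory '
                     'containing the data and model files.')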
Example 4
        # TODO: to be filled

    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        correct = sum(gold == auto
                      for gold, auto in zip(gold_labels, auto_labels))
        return 100.0 * correct / len(gold_labels)


if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    # resource_dir = '../res/'
    sentiment_analyzer = SentimentAnalyzer(resource_dir)
    trn_data = tsv_reader(resource_dir, 'sst.trn.tsv')
    dev_data = tsv_reader(resource_dir, 'sst.dev.tsv')
    tst_data = tsv_reader(resource_dir, 'sst.tst.tsv')
    sentiment_analyzer.train(trn_data, dev_data)
    print(sentiment_analyzer.evaluate(tst_data))
    sentiment_analyzer.save(os.path.join(resource_dir, 'hw2-model'))
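    # reloading right after saving presumably sanity-checks that the model
    # round-trips through save/load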
    sentiment_analyzer.load(os.path.join(resource_dir, 'hw2-model'))
Example 5
import os
from time import time

from src.hw3 import NamedEntityRecognizer
from src.util import tsv_reader

if __name__ == '__main__':
    resource_dir = os.environ.get('RESOURCE')
    tst_data = tsv_reader(resource_dir, 'conll03.eng.tst.tsv')
    start = time()
    named_entity_recognizer = NamedEntityRecognizer(resource_dir)
    named_entity_recognizer.load(os.path.join(resource_dir, 'hw3-model'))
    score = named_entity_recognizer.evaluate(tst_data)
    end = time()
    print(score, end - start)