Code Example #1
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
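All of the examples on this page go through `FastText.emb_list`, which, judging from its use below, maps a sequence of tokens to one embedding vector per token (50-dimensional, going by the file name and the `np.zeros(50)` padding in Code Example #7). A minimal sketch of the assumed call shape; the resource path and token list are hypothetical:

import os

# Hypothetical usage of the assumed FastText.emb_list API: a list of
# tokens in, one 50-dimensional vector per token out.
vsm = FastText(os.path.join('resources', 'fasttext-50-180614.bin'))
vectors = vsm.emb_list(['John', 'lives', 'in', 'Atlanta'])
assert len(vectors) == 4  # one embedding per input token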
Code Example #2
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))

        if os.path.exists(resource_dir + '/label2Idx.json'):
            with open(resource_dir + '/label2Idx.json') as fi:
                self.label2Idx = json.load(fi)

            self.idx2Label = {v: k for k, v in self.label2Idx.items()}
Code Example #3
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # print(os.path.join(resource_dir, 'sst.trn.tsv'))
        # self.train = pd.read_csv(os.path.join(resource_dir, 'sst.trn.tsv'), sep='\t')

        # TODO: to be filled.
        self.net = Net()
        self.resource_dir = os.environ.get('RESOURCE')
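Note that this variant stores the resource directory from a `RESOURCE` environment variable rather than from the `resource_dir` argument, so the variable must be exported before construction. A minimal sketch of the assumed setup, with a hypothetical path, assuming this constructor belongs to the `SentimentAnalyzer` shown in Code Example #7:

import os

os.environ['RESOURCE'] = '/path/to/resources'  # hypothetical path
analyzer = SentimentAnalyzer('/path/to/resources')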
Code Example #4
class NamedEntityRecognizer(Component):
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir

        if os.path.exists(resource_dir + '/label2Idx.json'):
            with open(resource_dir + '/label2Idx.json') as fi:
                self.label2Idx = json.load(fi)

            self.idx2Label = {v: k for k, v in self.label2Idx.items()}

        # TODO: to be filled.

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        # reconstruct the architecture from JSON
        with open(model_path + '/model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        # load weights into the reconstructed model
        self.model.load_weights(model_path + "/model.h5")
        print("Loaded model from disk")

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # make sure the target directory exists
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        # serialize the model architecture to JSON
        model_json = self.model.to_json()
        with open(model_path + "/model.json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        self.model.save_weights(model_path + "/model.h5")
        print("Saved model to disk")

    def train(self, trn_data: List[Tuple[List[str], List[str]]],
              dev_data: List[Tuple[List[str], List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])

        labelSet = set()

        for dataset in [trn_ys, dev_ys]:
            for label_seq in dataset:
                for label in label_seq:
                    labelSet.add(label)

        self.label2Idx = {}
        for label in labelSet:
            self.label2Idx[label] = len(self.label2Idx)
        self.label2Idx['PAD'] = len(self.label2Idx)

        with open(self.resource_dir + '/label2Idx.json', 'w') as fo:
            json.dump(self.label2Idx, fo)

        self.idx2Label = {v: k for k, v in self.label2Idx.items()}

        Y_train = self.createMatrices(trn_ys, self.label2Idx)
        Y_dev = self.createMatrices(dev_ys, self.label2Idx)

        train_xs = self.padding_training(trn_xs)
        devlop_xs = self.padding_training(dev_xs)

        Y_train_padding = self.padding_training_Y(Y_train)
        Y_dev_padding = self.padding_training_Y(Y_dev)

        train_ys = [
            np_utils.to_categorical(i, num_classes=len(self.label2Idx))
            for i in Y_train_padding
        ]
        train_ys = np.asarray(train_ys)

        devlop_ys = [
            np_utils.to_categorical(i, num_classes=len(self.label2Idx))
            for i in Y_dev_padding
        ]
        devlop_ys = np.asarray(devlop_ys)

        max_sentence_length = 125
        embedding_dim = train_xs.shape[2]

        image_input = Input(shape=(max_sentence_length, embedding_dim))

        output = Bidirectional(
            LSTM(200,
                 return_sequences=True,
                 dropout=0.50,
                 recurrent_dropout=0.25))(image_input)
        output = TimeDistributed(Dense(len(self.label2Idx),
                                       activation='softmax'))(output)

        self.model = Model(inputs=[image_input], outputs=output)
        self.model.compile(loss='categorical_crossentropy', optimizer='nadam')

        self.model.fit(train_xs,
                       train_ys,
                       batch_size=50,
                       epochs=15,
                       validation_data=(devlop_xs, devlop_ys))

    def createMatrices(self, sentences, label2Idx):
        dataset = []
        for sentence in sentences:
            labelIndices = []
            for label in sentence:
                labelIndices.append(label2Idx[label])

            dataset.append(labelIndices)
        return dataset

    def padding_training(self, trn_xs, max_sentence_length=125):

        blank_embedding = self.vsm.emb_list(' ')[0]
        train_xs = []
        for line in trn_xs:
            padding = max_sentence_length - len(line)
            for i in range(0, padding):
                line.append(blank_embedding)
            train_xs.append(line)

        train_xs = np.array(train_xs)
        train_xs = train_xs.reshape(train_xs.shape[0], train_xs.shape[1],
                                    train_xs.shape[2])
        return train_xs

    def padding_training_Y(self, Y_train, max_sentence_length=125):

        blank_embedding = len(self.label2Idx) - 1
        Y_train_padding = []
        for line in Y_train:
            padding = max_sentence_length - len(line)
            for i in range(0, padding):
                line.append(blank_embedding)
            Y_train_padding.append(line)

        Y_train_padding = np.array(Y_train_padding)
        return Y_train_padding


    def decode(self, data: List[Tuple[List[str], List[str]]],
               **kwargs) -> List[List[str]]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]

        padding_xs = self.padding_training(xs)
        padding_pred = self.model.predict(padding_xs)

        padding_idx = np.argmax(padding_pred, axis=2)
        padding_labels = []
        for sentences in padding_idx:
            token_label = []
            for l in sentences:
                if l != len(self.label2Idx) - 1:
                    token_label.append(self.idx2Label[l])
            padding_labels.append(token_label)

        # y_classes = pred.argmax(axis=-1)
        return padding_labels

    def evaluate(self, data: List[Tuple[List[str], List[str]]],
                 **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        preds = self.decode(data)
        labels = [y for y, _ in data]
        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)
        return float(acc.get()[1])
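A hedged end-to-end sketch of how this class appears to be driven. The toy sentences follow the (labels, tokens) tuple order implied by `train()` and `decode()`; the resource and model paths are assumptions:

# Hypothetical driver; the paths and toy data are assumptions, not part
# of the class above.
trn_data = [(['B-PER', 'O'], ['John', 'slept']),
            (['B-LOC', 'O'], ['Atlanta', 'rains'])]
dev_data = [(['B-PER', 'O'], ['Mary', 'ran'])]

ner = NamedEntityRecognizer('resources')
ner.train(trn_data, dev_data)    # builds label2Idx and fits the BiLSTM
ner.save('resources/hw3-model')  # writes model.json and model.h5
ner.load('resources/hw3-model')
print(ner.evaluate(dev_data))    # chunk-level F1 via ChunkF1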
Code Example #5
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir

        trn_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.trn.tsv'))
        dev_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.dev.tsv'))
        tst_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.tst.tsv'))

        token_dic = {}
        for sentences in trn_data + dev_data + tst_data:
            for words in sentences:
                token = words[0]
                token_dic[token] = True

        tokens = list(token_dic.keys())
        tokens_emb = self.vsm.emb_list(tokens)

        trn_sentence = self.get_char_inform(trn_data)
        dev_sentence = self.get_char_inform(dev_data)
        tst_sentence = self.get_char_inform(tst_data)

        ## prepare labels and words
        label_set = set()
        words = {}
        for dataset in [trn_sentence, dev_sentence, tst_sentence]:
            for sentence in dataset:
                for token, char, label in sentence:
                    if label != 'XX':
                        label_set.add(label)
                        words[token.lower()] = True

        ## label index
        label_idx = {}
        for label in label_set:
            label_idx[label] = len(label_idx)
        self.label_idx = label_idx

        ## case index and case embedding
        case_idx = {
            'numeric': 0,
            'allLower': 1,
            'allUpper': 2,
            'initialUpper': 3,
            'other': 4,
            'mainly_numeric': 5,
            'contains_digit': 6,
            'PADDING_TOKEN': 7
        }
        self.case_embeddings = np.identity(len(case_idx), dtype='float32')
        self.case_idx = case_idx

        ## word to index and word embedding
        word_idx = {}
        word_embeddings = []

        df = pd.DataFrame([tokens, tokens_emb])
        combine_embeddings = df.T.values.tolist()

        # for line in combine_embeddings:
        for i in range(len(combine_embeddings)):
            split = combine_embeddings[i]
            word = split[0]

            if len(word_idx) == 0:
                word_idx["PADDING_TOKEN"] = len(word_idx)
                vector = np.zeros(len(split[1]))
                word_embeddings.append(vector)
                word_idx["UNKNOWN_TOKEN"] = len(word_idx)
                vector = np.random.uniform(-0.25, 0.25, len(split[1]))
                word_embeddings.append(vector)

            if split[0].lower() in words:
                vector = np.array([float(num) for num in split[1]])
                word_embeddings.append(vector)
                word_idx[split[0]] = len(word_idx)

        self.word_idx = word_idx
        self.word_embeddings = np.array(word_embeddings)

        ## char index
        char_idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            char_idx[c] = len(char_idx)

        self.char_idx = char_idx

        ## prepare dataset
        train_set = self.padding(
            self.get_embedded_data(trn_sentence, word_idx, label_idx, case_idx,
                                   char_idx))
        dev_set = self.padding(
            self.get_embedded_data(dev_sentence, word_idx, label_idx, case_idx,
                                   char_idx))
        test_set = self.padding(
            self.get_embedded_data(tst_sentence, word_idx, label_idx, case_idx,
                                   char_idx))

        self.idx2Label = {v: k for k, v in label_idx.items()}
        self.train_batch, self.train_batch_len = self.get_batch(train_set)
        self.dev_batch, self.dev_batch_len = self.get_batch(dev_set)
        self.test_batch, self.test_batch_len = self.get_batch(test_set)
Code Example #6
class NamedEntityRecognizer(Component):
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        self.resource_dir = resource_dir

        trn_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.trn.tsv'))
        dev_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.dev.tsv'))
        tst_data = self.format_data(
            tsv_reader(resource_dir, 'conll03.eng.tst.tsv'))

        token_dic = {}
        for sentences in trn_data + dev_data + tst_data:
            for words in sentences:
                token = words[0]
                token_dic[token] = True

        tokens = list(token_dic.keys())
        tokens_emb = self.vsm.emb_list(tokens)

        trn_sentence = self.get_char_inform(trn_data)
        dev_sentence = self.get_char_inform(dev_data)
        tst_sentence = self.get_char_inform(tst_data)

        ## prepare labels and words
        label_set = set()
        words = {}
        for dataset in [trn_sentence, dev_sentence, tst_sentence]:
            for sentence in dataset:
                for token, char, label in sentence:
                    if label != 'XX':
                        label_set.add(label)
                        words[token.lower()] = True

        ## label index
        label_idx = {}
        for label in label_set:
            label_idx[label] = len(label_idx)
        self.label_idx = label_idx

        ## case index and case embedding
        case_idx = {
            'numeric': 0,
            'allLower': 1,
            'allUpper': 2,
            'initialUpper': 3,
            'other': 4,
            'mainly_numeric': 5,
            'contains_digit': 6,
            'PADDING_TOKEN': 7
        }
        self.case_embeddings = np.identity(len(case_idx), dtype='float32')
        self.case_idx = case_idx

        ## word to index and word embedding
        word_idx = {}
        word_embeddings = []

        df = pd.DataFrame([tokens, tokens_emb])
        combine_embeddings = df.T.values.tolist()

        # for line in combine_embeddings:
        for i in range(len(combine_embeddings)):
            split = combine_embeddings[i]
            word = split[0]

            if len(word_idx) == 0:
                word_idx["PADDING_TOKEN"] = len(word_idx)
                vector = np.zeros(len(split[1]))
                word_embeddings.append(vector)
                word_idx["UNKNOWN_TOKEN"] = len(word_idx)
                vector = np.random.uniform(-0.25, 0.25, len(split[1]))
                word_embeddings.append(vector)

            if split[0].lower() in words:
                vector = np.array([float(num) for num in split[1]])
                word_embeddings.append(vector)
                word_idx[split[0]] = len(word_idx)

        self.word_idx = word_idx
        self.word_embeddings = np.array(word_embeddings)

        ## char index
        char_idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            char_idx[c] = len(char_idx)

        self.char_idx = char_idx

        ## prepare dataset
        train_set = self.padding(
            self.get_embedded_data(trn_sentence, word_idx, label_idx, case_idx,
                                   char_idx))
        dev_set = self.padding(
            self.get_embedded_data(dev_sentence, word_idx, label_idx, case_idx,
                                   char_idx))
        test_set = self.padding(
            self.get_embedded_data(tst_sentence, word_idx, label_idx, case_idx,
                                   char_idx))

        self.idx2Label = {v: k for k, v in label_idx.items()}
        self.train_batch, self.train_batch_len = self.get_batch(train_set)
        self.dev_batch, self.dev_batch_len = self.get_batch(dev_set)
        self.test_batch, self.test_batch_len = self.get_batch(test_set)

    def get_word_embd(self):
        return self.word_embeddings

    def get_case_emb(self):
        return self.case_embeddings

    def get_char2index(self):
        return self.char_idx

    def prepare_data(self, data):
        data = self.format_data(data)
        sentences = self.get_char_inform(data)
        dataset = self.padding(
            self.get_embedded_data(sentences, self.word_idx, self.label_idx,
                                   self.case_idx, self.char_idx))
        batch, _ = self.get_batch(dataset)
        return batch

    def get_model(self):
        word_embeddings = self.word_embeddings
        case_embeddings = self.case_embeddings
        char_idx = self.char_idx
        label_idx = self.label_idx

        words_input = Input(shape=(None, ), dtype='int32', name='words_input')
        words = Embedding(input_dim=word_embeddings.shape[0],
                          output_dim=word_embeddings.shape[1],
                          weights=[word_embeddings],
                          trainable=False)(words_input)

        casing_input = Input(shape=(None, ),
                             dtype='int32',
                             name='casing_input')
        casing = Embedding(output_dim=case_embeddings.shape[1],
                           input_dim=case_embeddings.shape[0],
                           weights=[case_embeddings],
                           trainable=False)(casing_input)

        character_input = Input(shape=(None, 52), name='char_input')
        embed_char_out = TimeDistributed(
            Embedding(len(char_idx),
                      30,
                      embeddings_initializer=RandomUniform(minval=-0.5,
                                                           maxval=0.5)),
            name='char_embedding')(character_input)

        dropout = Dropout(0.5)(embed_char_out)
        conv1d_out = TimeDistributed(
            Conv1D(kernel_size=3,
                   filters=30,
                   padding='same',
                   activation='tanh',
                   strides=1))(dropout)
        maxpool_out = TimeDistributed(MaxPooling1D(52))(conv1d_out)
        char = TimeDistributed(Flatten())(maxpool_out)
        char = Dropout(0.5)(char)

        output = concatenate([words, casing, char])
        output = Bidirectional(
            LSTM(200,
                 return_sequences=True,
                 dropout=0.50,
                 recurrent_dropout=0.25))(output)
        output = TimeDistributed(Dense(len(label_idx),
                                       activation='softmax'))(output)
        model = Model(inputs=[words_input, casing_input, character_input],
                      outputs=[output])
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='nadam')
        model.summary()
        return model

    def get_casing(self, word, case_tag):
        casing = 'other'

        num_digit = 0
        for char in word:
            if char.isdigit():
                num_digit += 1

        digit_frac = num_digit / float(len(word))
        if word.isdigit():
            casing = 'numeric'
        elif digit_frac > 0.5:
            casing = 'mainly_numeric'
        elif word.islower():
            casing = 'allLower'
        elif word.isupper():
            casing = 'allUpper'
        elif word[0].isupper():
            casing = 'initialUpper'
        elif num_digit > 0:
            casing = 'contains_digit'

        return case_tag[casing]

    def get_batch(self, data):
        l = []
        for i in data:
            l.append(len(i[0]))
        l = set(l)
        batches = []
        batch_len = []
        z = 0
        for i in l:
            for batch in data:
                if len(batch[0]) == i:
                    batches.append(batch)
                    z += 1
            batch_len.append(z)
        return batches, batch_len

    def get_embedded_data(self, sentences, word_idx, label_idx, case_idx,
                          char_idx):
        unknownIdx = word_idx['UNKNOWN_TOKEN']
        paddingIdx = word_idx['PADDING_TOKEN']
        dataset = []
        wordCount = 0
        unknownWordCount = 0

        for sentence in sentences:
            wordIndices = []
            caseIndices = []
            charIndices = []
            labelIndices = []
            for word, char, label in sentence:
                wordCount += 1
                if word in word_idx:
                    wordIdx = word_idx[word]
                elif word.lower() in word_idx:
                    wordIdx = word_idx[word.lower()]
                else:
                    wordIdx = unknownIdx
                    unknownWordCount += 1
                charIdx = []
                for x in char:
                    charIdx.append(char_idx[x])
                # Get the label and map to int
                wordIndices.append(wordIdx)
                caseIndices.append(self.get_casing(word, case_idx))
                charIndices.append(charIdx)
                if label != 'XX':
                    labelIndices.append(label_idx[label])

            dataset.append(
                [wordIndices, caseIndices, charIndices, labelIndices])

        return dataset

    def get_mini_batch(self, dataset, batch_len):
        start = 0
        for i in batch_len:
            tokens = []
            casing = []
            char = []
            labels = []
            data = dataset[start:i]
            start = i
            for dt in data:
                t, c, ch, l = dt
                l = np.expand_dims(l, -1)
                tokens.append(t)
                casing.append(c)
                char.append(ch)
                labels.append(l)
            yield np.asarray(labels), np.asarray(tokens), np.asarray(
                casing), np.asarray(char)

    def get_char_inform(self, Sentences):
        for i, sentence in enumerate(Sentences):
            for j, data in enumerate(sentence):
                chars = [c for c in data[0]]
                Sentences[i][j] = [data[0], chars, data[1]]
        return Sentences

    def padding(self, Sentences):
        # the char input of the network is fixed at 52, so every character
        # sequence is padded (or truncated) to that length
        for i, sentence in enumerate(Sentences):
            Sentences[i][2] = pad_sequences(Sentences[i][2],
                                            52,
                                            padding='post')
        return Sentences

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        self.model = load_model(model_path)
        return self.model

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        self.model.save(model_path)

    def tuple2list(self, data):
        res = []
        for el in data:
            res += el
        return res

    def format_data(self, data):
        temp1 = []
        for ele in data:
            token = ele[0]
            label = ele[1]
            temp2 = []
            for i in range(len(token)):
                temp2.append([label[i], token[i]])
            temp1.append(temp2)
        return temp1

    def train(self, trn_data: List[Tuple[List[str], List[str]]],
              dev_data: List[Tuple[List[str], List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """

        self.model = self.get_model()

        epochs = 80
        for epoch in range(epochs):
            print("Epoch %d/%d" % (epoch, epochs))
            a = Progbar(len(self.train_batch_len))
            for i, batch in enumerate(
                    self.get_mini_batch(self.train_batch,
                                        self.train_batch_len)):
                labels, tokens, casing, char = batch
                self.model.train_on_batch([tokens, casing, char], labels)
                a.update(i)
            a.update(i + 1)
            print(' ')

        # model.save("hw3-model")
        save_data = [
            self.word_embeddings, self.case_embeddings, self.idx2Label,
            self.word_idx, self.word_idx, self.label_idx, self.case_idx,
            self.char_idx
        ]

        with open(os.path.join(self.resource_dir, 'pickle'), 'wb') as handle:
            pickle.dump(save_data, handle)

    def dev_evaluate(self, model, data):

        correctLabels = []
        predLabels = []
        b = Progbar(len(data))
        for i, data1 in enumerate(data):
            tokens, casing, char, labels = data1
            tokens = np.asarray([tokens])
            casing = np.asarray([casing])
            char = np.asarray([char])
            pred = model.predict([tokens, casing, char], verbose=False)[0]
            pred = pred.argmax(axis=-1)  # Predict the classes
            correctLabels.append(labels)
            predLabels.append(pred)
            b.update(i)
        b.update(i + 1)

        label_pred = []
        for sentence in predLabels:
            label_pred.append(
                [self.idx2Label[element] for element in sentence])
        label_correct = []
        for sentence in correctLabels:
            label_correct.append(
                [self.idx2Label[element] for element in sentence])

        acc = ChunkF1()
        for pred, label in zip(label_pred, label_correct):
            acc.update(pred, label)

        print(float(acc.get()[1]))

    def decode(self, data: List[Tuple[List[str], List[str]]],
               **kwargs) -> Tuple[List[List[str]], List[List[str]]]:
        """
        :param data:
        :param kwargs:
        :return: the predicted and the gold label sequences.
        """

        with open(os.path.join(self.resource_dir, 'pickle'), 'rb') as handle:
            save_data = pickle.load(handle)

        (self.word_embeddings, self.case_embeddings, self.idx2Label,
         self.word_idx, self.label_idx, self.case_idx,
         self.char_idx) = save_data

        dataset = self.prepare_data(data)
        model = self.load(os.path.join(self.resource_dir, 'hw3-model'))

        correctLabels = []
        predLabels = []
        b = Progbar(len(dataset))
        for i, data1 in enumerate(dataset):
            tokens, casing, char, labels = data1
            tokens = np.asarray([tokens])
            casing = np.asarray([casing])
            char = np.asarray([char])
            pred = model.predict([tokens, casing, char], verbose=False)[0]
            pred = pred.argmax(axis=-1)  # Predict the classes
            correctLabels.append(labels)
            predLabels.append(pred)
            b.update(i)
        b.update(i + 1)
        label_pred = []

        for sentence in predLabels:
            label_pred.append(
                [self.idx2Label[element] for element in sentence])
        label_correct = []
        for sentence in correctLabels:
            label_correct.append(
                [self.idx2Label[element] for element in sentence])
        return label_pred, label_correct

    def evaluate(self, data: List[Tuple[List[str], List[str]]],
                 **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """

        preds, labels = self.decode(data)
        # print(preds)
        # print(labels)

        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)

        # print(float(acc.get()[1]))
        return float(acc.get()[1])
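`get_batch` above buckets sentences by length and records running offsets, so each slice taken by `get_mini_batch` holds sentences of a single shared length and stacks into rectangular arrays without word-level padding. A minimal sketch of consuming the generator, assuming `ner` is an already-constructed instance:

def iterate_batches(ner):
    # each yielded batch contains sentences of one shared length, so the
    # per-field lists stack cleanly into numpy arrays
    for labels, tokens, casing, char in ner.get_mini_batch(
            ner.train_batch, ner.train_batch_len):
        assert tokens.shape[0] == labels.shape[0]  # same batch size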
Code Example #7
class SentimentAnalyzer(Component):
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # print(os.path.join(resource_dir, 'sst.trn.tsv'))
        # self.train = pd.read_csv(os.path.join(resource_dir, 'sst.trn.tsv'), sep='\t')

        # TODO: to be filled.
        self.net = Net()
        self.resource_dir = os.environ.get('RESOURCE')

    def pad_x(self, trn_xs):
        # sentences are padded (or truncated) to a fixed length of 61 tokens
        max_len = 61
        trn_xs = list(trn_xs)

        for i in range(len(trn_xs)):
            if len(trn_xs[i]) <= max_len:
                temp = [np.zeros(50) for _ in range(max_len - len(trn_xs[i]))]
                trn_xs[i] = trn_xs[i] + temp
            else:
                trn_xs[i] = trn_xs[i][0:max_len]
        trn_xs = tuple(trn_xs)
        return trn_xs

    def load1(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        model_name = kwargs['name']
        path = os.path.join(model_path, model_name)
        model = torch.load(path)
        return model

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        model = Net()
        model.load_state_dict(torch.load(model_path))
        return model

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        torch.save(self.net.state_dict(), model_path)

    def save1(self, model_path: str, **kwargs):

        model = kwargs['model']
        model_name = kwargs['name']
        path = os.path.join(model_path, model_name)

        torch.save(model, path)

    def get_tensor_data(self, dev_xs):
        result = []
        for i in dev_xs:
            x = np.array(i)
            # x = torch.from_numpy(x)
            result.append(x)
        result = np.array(result)
        dev_xs = torch.FloatTensor(result)
        dev_xs = dev_xs.unsqueeze(1)
        return dev_xs

    def plot_fig(self, x1, x2, epoch):
        fig, ax = plt.subplots()
        ax.plot(x1, label='training')
        ax.plot(x2, label='validation')
        ax.set(xlabel='epoch', ylabel='accuracy', title='model accuracy')
        ax.grid()

        name = 'model' + str(epoch) + '.png'
        path = os.path.join(self.resource_dir, name)
        plt.legend()
        fig.savefig(path)
        # plt.show()

    def train(self, trn_data: List[Tuple[int, List[str]]],
              dev_data: List[Tuple[int, List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])

        trn_xs = self.pad_x(trn_xs)

        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])

        dev_xs = self.pad_x(dev_xs)

        train_data = MyDataset(trn_xs, trn_ys)
        vali_data = MyDataset(dev_xs, dev_ys)

        dev_xs = self.get_tensor_data(dev_xs)
        dev_ys = list(dev_ys)

        train_loader = Data.DataLoader(dataset=train_data,
                                       batch_size=64,
                                       shuffle=True)
        # vali_loader = Data.DataLoader(dataset=vali_data)

        # TODO: to be filled

        net = self.net
        criterion = nn.CrossEntropyLoss()
        # criterion = nn.MultiLabelSoftMarginLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9)

        total_epoch = 22

        train_acc = []
        vali_acc = []
        for epoch in range(total_epoch):

            for i, data in enumerate(train_loader):

                inputs, labels = data
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()


            vali_output = net(dev_xs)
            vali_pred_y = torch.max(vali_output, 1)[1].data.numpy()
            vali_accuracy = float(
                (vali_pred_y == dev_ys).astype(int).sum()) / float(len(dev_ys))

            # note: train accuracy is computed on the last mini-batch only
            train_output = net(inputs)
            train_pred_y = torch.max(train_output, 1)[1].data.numpy()
            train_accuracy = float(
                (train_pred_y == labels.tolist()).astype(int).sum()) / float(
                    len(labels))

            # print('epoch:', epoch, 'of', total_epoch ,'| train loss: %.4f' % loss.data.numpy(),'| validation accuracy: %.4f' % vali_accuracy)
            print('epoch:', epoch, 'of', total_epoch,
                  '| train accuracy: %.4f' % train_accuracy,
                  '| validation accuracy: %.4f' % vali_accuracy)

            train_acc.append(train_accuracy)
            vali_acc.append(vali_accuracy)

        self.plot_fig(train_acc, vali_acc, epoch)
        self.net = net

        self.save(os.path.join(self.resource_dir, 'hw2-model'))

    def decode(self, data: List[Tuple[int, List[str]]], **kwargs) -> List[int]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        xs = self.pad_x(xs)
        inputs = self.get_tensor_data(xs)
        model = self.load(os.path.join(self.resource_dir, 'hw2-model'))
        outputs = model(inputs)
        pred_y = torch.max(outputs, 1)[1].data.numpy()
        return pred_y


    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        # print(100.0 * correct / total)
        return 100.0 * correct / total
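`Net` and `MyDataset` are referenced above but never shown on this page. A minimal sketch of what they might look like, inferred from the shapes used in `train()` (sentences padded to 61 tokens of 50-dimensional embeddings, unsqueezed to a single channel, five output classes); every layer size below is an assumption:

import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as Data

class Net(nn.Module):
    # hypothetical CNN classifier for inputs of shape (N, 1, 61, 50)
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(3, 50)),  # -> (N, 16, 59, 1)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(59, 1)))      # -> (N, 16, 1, 1)
        self.fc = nn.Linear(16, 5)                  # five sentiment classes

    def forward(self, x):
        x = self.conv(x)
        return self.fc(x.view(x.size(0), -1))

class MyDataset(Data.Dataset):
    # hypothetical dataset pairing padded embedding lists with labels
    def __init__(self, xs, ys):
        self.xs, self.ys = xs, ys

    def __len__(self):
        return len(self.xs)

    def __getitem__(self, i):
        # (61, 50) embedding matrix -> (1, 61, 50) single-channel tensor
        x = torch.FloatTensor(np.array(self.xs[i])).unsqueeze(0)
        return x, self.ys[i]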
Code Example #8
class SentimentAnalyzer(Component):
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # TODO: to be filled.

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def train(self, trn_data: List[Tuple[int, List[str]]],
              dev_data: List[Tuple[int, List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])
        # TODO: to be filled
        pass

    def decode(self, data: List[Tuple[int, List[str]]], **kwargs) -> List[int]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        # TODO: to be filled

    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        return 100.0 * correct / total
Code Example #9
File: hw2.py  Project: clin366/cs571
class SentimentAnalyzer(Component):
    def __init__(self, resource_dir: str, embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # TODO: to be filled.

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        # reconstruct the architecture from JSON
        with open(model_path + '/model.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        # load weights into the reconstructed model
        self.model.load_weights(model_path + "/model.h5")
        print("Loaded model from disk")

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # serialize the model to JSON; make sure the directory exists first
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_json = self.model.to_json()
        with open(model_path + "/model.json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        self.model.save_weights(model_path + "/model.h5")
        print("Saved model to disk")

    def padding_training(self, trn_xs, max_sentence_length=80):

        blank_embedding = self.vsm.emb_list(' ')[0]
        train_xs = []
        for line in trn_xs:
            padding = max_sentence_length - len(line)
            for i in range(0, padding):
                line.append(blank_embedding)
            train_xs.append(line)
            
        train_xs = np.array(train_xs)
        train_xs = train_xs.reshape(train_xs.shape[0], train_xs.shape[1],
                                    train_xs.shape[2], 1)
        return train_xs

    def train(self, trn_data: List[Tuple[int, List[str]]], dev_data: List[Tuple[int, List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])

        # generate label vector
        number_of_classes = 5

        Y_train = np_utils.to_categorical(trn_ys, number_of_classes)
        Y_dev = np_utils.to_categorical(dev_ys, number_of_classes)

        # padding the sentence and generate training/developing dataset
        train_xs = self.padding_training(trn_xs)
        devlop_xs = self.padding_training(dev_xs)

        # Define the model
        first_ksize = 3
        second_ksize = 4
        third_ksize = 5
        max_sentence_length = 80
        embedding_dim = train_xs.shape[2]
        # instantiate regularizer
        reg = l2(0.15)
        image_input = Input(shape=(max_sentence_length, embedding_dim, 1))

        first_kernel = Conv2D(64, (first_ksize, embedding_dim), strides=(1, 1),
                              padding='valid', activation='relu')(image_input)
        first_kernel = MaxPooling2D(pool_size=(max_sentence_length - first_ksize + 1, 1),
                                    strides=(1, 1), padding='valid')(first_kernel)
        first_kernel = Flatten()(first_kernel)

        second_kernel = Conv2D(64, (second_ksize, embedding_dim), strides=(1, 1),
                               padding='valid', activation='relu')(image_input)
        second_kernel = MaxPooling2D(pool_size=(max_sentence_length - second_ksize + 1, 1),
                                     strides=(1, 1), padding='valid')(second_kernel)
        second_kernel = Flatten()(second_kernel)

        third_kernel = Conv2D(64, (third_ksize, embedding_dim), strides=(1, 1),
                              padding='valid', activation='relu')(image_input)
        third_kernel = MaxPooling2D(pool_size=(max_sentence_length - third_ksize + 1, 1),
                                    strides=(1, 1), padding='valid')(third_kernel)
        third_kernel = Flatten()(third_kernel)

        merged = concatenate([first_kernel, second_kernel, third_kernel])
        merged = Dropout(0.5)(merged)
        output = Dense(5, activation='softmax', activity_regularizer=reg)(merged)

        self.model = Model(inputs=[image_input], outputs=output)
        # compile the model
        self.model.compile(loss='categorical_crossentropy', optimizer=Adam(),
                           metrics=['accuracy'])
        # batch input
        gen = ImageDataGenerator()
        test_gen = ImageDataGenerator()
        train_generator = gen.flow(train_xs, Y_train, batch_size=50)
        test_generator = test_gen.flow(devlop_xs, Y_dev, batch_size=50)
        # fit the model
        self.model.fit_generator(train_generator,
                                 steps_per_epoch=train_xs.shape[0] // 50,
                                 epochs=15,
                                 validation_data=test_generator,
                                 validation_steps=devlop_xs.shape[0] // 50)

    def decode(self, data: List[Tuple[int, List[str]]], **kwargs) -> List[int]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        padding_xs = self.padding_training(xs)
        pred = self.model.predict(padding_xs)
        y_classes = pred.argmax(axis=-1)
        return y_classes

    def evaluate(self, data: List[Tuple[int, List[str]]], **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        gold_labels = [y for y, _ in data]
        auto_labels = self.decode(data)
        total = correct = 0
        for gold, auto in zip(gold_labels, auto_labels):
            if gold == auto:
                correct += 1
            total += 1
        print("accuracy")
        print(100.0 * correct / total)
        return 100.0 * correct / total
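This is the familiar multi-kernel sentence-CNN arrangement: three parallel Conv2D branches whose kernels span the full embedding width with heights 3, 4, and 5, each max-pooled over time and concatenated ahead of the softmax layer. A hedged driver sketch; the resource path is an assumption and the data is synthetic (the real assignment uses SST sentences):

# Hypothetical driver for the SentimentAnalyzer above. At least 50
# training examples are needed because steps_per_epoch is len // 50.
sa = SentimentAnalyzer('resources')
trn_data = [(i % 5, ['word'] * 10) for i in range(100)]  # labels 0-4
dev_data = [(i % 5, ['word'] * 10) for i in range(50)]
sa.train(trn_data, dev_data)
sa.save('resources/hw2-model')
sa.load('resources/hw2-model')
print(sa.evaluate(dev_data))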
Code Example #10
File: hw3.py  Project: vdwanderley/cs571
class NamedEntityRecognizer(Component):
    def __init__(self,
                 resource_dir: str,
                 embedding_file='fasttext-50-180614.bin'):
        """
        Initializes all resources and the model.
        :param resource_dir: a path to the directory where resource files are located.
        """
        self.vsm = FastText(os.path.join(resource_dir, embedding_file))
        # TODO: to be filled.

    def load(self, model_path: str, **kwargs):
        """
        Load the pre-trained model.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def save(self, model_path: str, **kwargs):
        """
        Saves the current model to the path.
        :param model_path:
        :param kwargs:
        """
        # TODO: to be filled
        pass

    def train(self, trn_data: List[Tuple[List[str], List[str]]],
              dev_data: List[Tuple[List[str], List[str]]], *args, **kwargs):
        """
        Trains the model.
        :param trn_data: the training data.
        :param dev_data: the development data.
        :param args:
        :param kwargs:
        :return:
        """
        trn_ys, trn_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in trn_data])
        dev_ys, dev_xs = zip(*[(y, self.vsm.emb_list(x)) for y, x in dev_data])
        # TODO: to be filled
        pass

    def decode(self, data: List[Tuple[List[str], List[str]]],
               **kwargs) -> List[List[str]]:
        """
        :param data:
        :param kwargs:
        :return: the list of predicted labels.
        """
        xs = [self.vsm.emb_list(x) for _, x in data]
        # TODO: to be filled

    def evaluate(self, data: List[Tuple[List[str], List[str]]],
                 **kwargs) -> float:
        """
        :param data:
        :param kwargs:
        :return: the accuracy of this model.
        """
        preds = self.decode(data)
        labels = [y for y, _ in data]
        acc = ChunkF1()
        for pred, label in zip(preds, labels):
            acc.update(pred, label)
        return float(acc.get()[1])