Example 1
 def test_fs2(self):
     r = FinkMos(x,
                 y,
                 tests=[f'f_10{x}' for x in range(8)],
                 tag_corpus=y_tags)
     print()
     print(r.to_feature_space2(3, 'DT', 'NNP', 'VBZ'))
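The tests passed to FinkMos are named feature functions (f_100 through f_107 here). As a hedged illustration of what one such indicator "test" might look like, assuming the usual log-linear-tagger setup; the real signatures live in the Features class and may differ:

def f_100(y_2, y_1, y, word):
    # hypothetical indicator test: fires when the current word is
    # capitalized and the proposed tag is a proper noun
    return 1 if word[:1].isupper() and y == 'NNP' else 0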
Example 2
 def test_features(self):
     r = FinkMos(x,
                 y,
                 tests=[f'f_10{x}' for x in range(8)],
                 tag_corpus=y_tags)
     print()
     print(r.linear_loss())
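linear_loss presumably computes the linear term of a log-linear model's objective, the dot product of the weight vector with the total feature counts over the gold tagging. A minimal sketch under that assumption (names hypothetical, not the repo's implementation):

import numpy as np

def linear_loss(v, feature_counts):
    # v: one weight per test; feature_counts: how often each test
    # fired over the gold (x, y) corpus
    return float(np.dot(v, feature_counts))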
Example 3
 def test_model_function(self):
     tests = Features().get_tests().keys()
     model1 = Model(tests)
     model1.fit(x, y)
     fm = FinkMos(x, y, model1.tests, model1.tag_corpus)
     a = model1.model_function(1, 3, [2, [3]], fm)
     print("model function result")
     print(a)
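model_function ultimately calls FinkMos.prob_q2, which presumably evaluates a log-linear (MEMM-style) conditional q(tag | history). A sketch of that form, assuming one feature row per candidate tag; this is an illustration, not the repo's implementation:

import numpy as np

def prob_q(v, f_matrix, next_tag):
    # f_matrix: (num_tags, num_tests) feature rows for one history
    scores = f_matrix @ v                   # unnormalized log-scores
    probs = np.exp(scores - scores.max())   # stabilized softmax
    probs /= probs.sum()
    return probs[next_tag]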
Example 4
 def test_normalize(self):
     x = pd.Series([
         '*', '*', 'The', 'Treasury', 'is', 'still', 'working', 'out',
         'the', 'details', 'with', 'bank', 'trade', 'associations', 'and',
         'the', 'other', 'government', 'agencies', 'that', 'have', 'a',
         'hand', 'in', 'fighting', "preencounte", 'word', '<STOP>'
     ])
     y = pd.Series([
         '*', '*', 'DT', 'NNP', 'VBZ', 'RB', 'VBG', 'RP', 'DT', 'NNS', 'IN',
         'NN', 'NN', 'NNS', 'CC', 'DT', 'JJ', 'NN', 'NNS', 'WDT', 'VBP',
         'DT', 'NN', 'IN', 'VBG', 'NN', 'Vt', '<STOP>'
     ])
     y_tags = y.unique()
     r = FinkMos(x,
                 y,
                 tests=[f'f_10{x}' for x in range(8)],
                 tag_corpus=y_tags)
     value = r.sentence_non_lineard_loss(np.zeros([8]))
     self.assertAlmostEqual(value, 72.0873, 3)
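The expected value can be sanity-checked by hand, assuming sentence_non_lineard_loss reduces to the sum of per-position log-normalizers when the weights are zero: y contains 16 distinct tags, and the sentence has 26 scored positions (28 tokens minus the two leading '*' padders), giving 26 * ln(16):

import math

print(26 * math.log(16))  # 72.0873..., matching assertAlmostEqual above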
Example 5
    def test_create_tuples(self):
        data = PreprocessTags(True).load_data(r'..\data\train.wtag')
        word_num = 1_000
        tag_corp = pd.Series(data.y[0:word_num]).unique()
        # generate tests (comment out if the tests file is already up to date)
        feat_generator = Features()
        feat_generator.generate_tuple_corpus(data.x[0:word_num],
                                             data.y[0:word_num])
        for template in feat_generator.templates_dict.values():
            feat_generator.generate_lambdas(template['func'],
                                            template['tuples'])
        feat_generator.save_tests()

        fm = FinkMos(data.x[0:word_num], data.y[0:word_num],
                     tag_corpus=tag_corp)
        fm.create_tuples()
        print("fm.weight_mat")
        print(fm.weight_mat)
        print("fm.tuple_5_list")
        print(fm.tuple_5_list)
        fm.create_feature_sparse_list_v2()
        # print(len(fm.f_matrix_list))
        print(fm.f_matrix_list[0].shape)
        fm.minimize_loss()
        fm.v.dump('values')
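create_feature_sparse_list_v2 stores per-position feature matrices sparsely, which makes sense because only a handful of indicator tests fire for any (history, tag) pair. A small sketch of that kind of representation (shapes and layout are assumptions, not the repo's actual format):

import numpy as np
from scipy import sparse

num_tags, num_tests = 16, 8
rows, cols = [2, 2, 5], [0, 3, 7]  # (tag, test) pairs that fire
f_matrix = sparse.csr_matrix(
    (np.ones(len(rows)), (rows, cols)), shape=(num_tags, num_tests))
print(f_matrix.shape)  # (16, 8), but only 3 stored values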
Example 6
class Model:
    def __init__(self):
        self.v = None  # TODO: init wisely ?
        self.x = None
        self.y = None
        self.num_tests = None
        self.vector_x_y = None
        self.tag_corpus = None
        self.tag_corpus_tokenized = None
        self.string2token = dict()
        self.token2string = dict()
        self.word_tags = dict()

    def fit(self, x, y, learning_rate=0.02, x_val=None, y_val=None):
        """
        Fit will train the Model
            - Encoding
                - For the data will create a Tensor [ # todo Read more
            - Gradient decent
                -Loss
                - update V
            - Calculate metrics

        :param x: DataFrame [row = sentence , col = word]
        :param y: DataFrame [row = sentence tag , col = Word tag]
        :param learning_rate: [don't know if need it] # TODO check if remove
        :param x_val:[row = sentence , col = word]
        :param y_val:[row = sentence tag , col = Word tag]
        :return: metrics dict {} $# TODO check witch metrics we need
        """
        assert isinstance(x, pd.Series)
        assert isinstance(y, pd.Series)
        self.x = x
        self.y = y
        base_corpus = pd.Series(['*', '<STOP>'])
        tag_corpus = pd.Series(
            y.value_counts().drop(['*', '<STOP>']).index)
        # put '*' and '<STOP>' at the beginning of the corpus
        self.tag_corpus = pd.concat([base_corpus, tag_corpus],
                                    ignore_index=True)
        self.tag_corpus_tokenized = range(len(self.tag_corpus))
        self._translation()  # create dictionaries for tokenizing
        self._vectorize()
        self.v = np.random.uniform(-0.5, 0.5, self.num_tests)
        print('Starting Minimize')
        self.fm.minimize_loss()
        self.v = self.fm.v

    def predict(self, x):
        """
        This will work with the Viterbi
        :param x: [sentences * words]
        :return: matrix [ sentence_tags * words]
        """
        # TODO validity checks: x type, and sentence format
        # (sentences should start with ['*', '*', ...])
        tokenized_ans = self._viterbi(x)

        # translate to tags
        tag_ans = pd.Series(
            [self.token2string[token] for token in tokenized_ans])
        # assert isinstance(tag_ans, pd.DataFrame)
        return tag_ans

    def confusion(self, y_hat, y):
        """

        :param y_hat:
        :type y_hat:
        :param y:
        :type y:
        :return:
        :rtype:
        """
        assert isinstance(y_hat, pd.Series)
        assert isinstance(y, pd.Series)

        roll_y = pd.Series(y.values.reshape(-1))
        roll_y_hat = pd.Series(y_hat.values.reshape(-1))

        most_frequent_tags = self.tag_corpus[2:12]
        sc = Score(most_frequent_tags)
        sc.fit(roll_y, roll_y_hat)
        return sc.matrix_confusion()

    def accuracy(self, y_hat, y):
        """

        :param x:
        :type x:
        :param y:
        :type y:
        :return:
        :rtype:
        """
        assert isinstance(y_hat, pd.Series)
        assert isinstance(y, pd.Series)

        # Series.drop removes index labels, so filter the padding and
        # boundary tags by value instead, keeping y / y_hat aligned
        roll_y = pd.Series(y.values.reshape(-1))
        roll_y_hat = pd.Series(y_hat.values.reshape(-1))
        keep = ~roll_y.isin(['<PAD>', '*', '<STOP>', ','])
        roll_y = roll_y[keep]
        roll_y_hat = roll_y_hat[keep]

        most_frequent_tags = self.tag_corpus[:10]
        sc = Score(most_frequent_tags)
        sc.fit(roll_y, roll_y_hat)
        return sc.over_all_acc()

    def model_function(self, next_tag, word_num, previous_tags, sentence):
        """
        :param next_tag: Next tag
        :type next_tag: int
        :param word_num: Number of word in the sentence
        :type word_num: int
        :param previous_tags: [ t_-1,t_-2] first index list of -2 position tag, second tag for -1 position tag
        :type previous_tags: [int, int]
        :param sentence: List of words
        :type sentence: np.array ['*','*', 'first','second', ..... ,'<STOP>', '<PAD>']
        :return: List of Probabilities of next_tag
        :rtype: List of Floats
        """
        assert isinstance(sentence, FinkMos)
        y_1 = self.token2string[previous_tags[0]]
        y_2_list = [self.token2string[tag] for tag in previous_tags[1]]
        y = self.token2string[next_tag]
        tup_list = []
        w_2 = str(sentence.x.iloc[word_num - 2])
        w_1 = str(sentence.x.iloc[word_num - 1])
        w = str(sentence.x.iloc[word_num])

        for y_2 in y_2_list:
            tup = [y_1, y_2, w, w_1, w_2]
            tup_list.append(tup)
        sentence.tuple_5_list = tup_list
        prob_q = sentence.prob_q2(self.v, next_tag, self.fm)

        return prob_q

    def _viterbi(self, sentence):
        """
        :param sentence: ['*', '*', 'The', 'Treasury', 'is', ..., '<STOP>']
        :type sentence: pd.Series
        :param all_tags: tokenized tags
        :type all_tags: List TODO: np.array??
        :return: List of tags
        :rtype: List
        """
        beam_width = 1
        num_words = len(sentence)  # includes '*', '*' and '<STOP>'
        sentence_fm = FinkMos(sentence, None,  # y is unknown at predict time
                              tag_corpus=self.tag_corpus)
        all_tags = self.tag_corpus_tokenized
        num_tags = len(all_tags)
        dims = (num_words, num_tags, num_tags)
        # pi(k, u, v) - maximum log-probability of a tag sequence
        # ending in tags u, v at position k
        p_table = np.zeros(dims, dtype=np.float16)
        bp_table = np.ones(dims, dtype=np.int8) * -1  # -1 implies no update
        answer = [None] * num_words
        for k in range(2, num_words):
            print(str(k) + " out of " + str(num_words - 1))
            curr_tag_v_subspace = self.word2tag_subspace(sentence[k])
            if k == 2:
                prev1_tag_u_subspace = [0]
            else:
                prev1_tag_u_subspace = self.word2tag_subspace(sentence[k - 1])
            if k in [2, 3]:  # 0 -> '*' tag
                prev2_candidates = [0]
            else:
                prev2_candidates = self.word2tag_subspace(sentence[k - 2])
            for prev1_tag_u in prev1_tag_u_subspace:
                # beam pruning scores depend on prev1_tag_u, so it must
                # happen inside this loop, after prev1_tag_u is bound
                optional_tags = prev2_candidates
                if len(optional_tags) > beam_width:
                    subset_inds = np.argpartition(
                        p_table[k - 1, optional_tags, prev1_tag_u],
                        -beam_width)[-beam_width:]
                    optional_tags = [optional_tags[i] for i in subset_inds]

                # naming relative to model_function entries
                for curr_tag_v in curr_tag_v_subspace:

                    a = p_table[k - 1, optional_tags, prev1_tag_u]
                    b = np.log(
                        self.model_function(
                            next_tag=curr_tag_v,
                            word_num=k,
                            previous_tags=[prev1_tag_u, optional_tags],
                            sentence=sentence_fm))

                    options = a + b

                    ind_in_options = np.argmax(options)
                    # take the winning tag from the candidate list
                    bp_table[k, prev1_tag_u,
                             curr_tag_v] = optional_tags[ind_in_options]
                    p_table[k, prev1_tag_u,
                            curr_tag_v] = options[ind_in_options]
        # argmax over the flattened last slice of the probability table;
        # unravel_index recovers the original 2-d (u, v) indices
        answer[num_words - 2], answer[num_words - 1] = np.unravel_index(
            p_table[num_words - 1, :, :].argmax(),
            p_table[num_words - 1, :, :].shape)
        for k in reversed(range(num_words - 2)):
            answer[k] = bp_table[k + 2, answer[k + 1], answer[k + 2]]
        return answer

    def _translation(self):
        # assert (self.tag_corpus[0] == '*')
        self.token2string = {
            key: value
            for key, value in enumerate(self.tag_corpus)
        }  # TODO make sure that self.tag_corpus[0] is '*'
        self.string2token = {
            value: key
            for key, value in enumerate(self.tag_corpus)
        }

    def create_word2tag_subspace(self):
        # if self.word_count_corpus[word] > 5:
        for ind, word in enumerate(self.x.values):
            if word in self.word_tags:
                self.word_tags[word].add(self.string2token[self.y[ind]])
            else:
                self.word_tags[word] = {self.string2token[self.y[ind]]}

    def word2tag_subspace(self, word):
        if word in self.word_tags:
            return list(self.word_tags[word])
        else:
            return self.tag_corpus_tokenized

    def _vectorize(self):
        self.create_word2tag_subspace()

        self.fm = FinkMos(self.x, self.y, tag_corpus=self.tag_corpus)
        self.num_tests = len(self.fm.test_dict)
        self.fm.create_tuples()
        self.fm.create_feature_sparse_list_v2()

    def acc_per_tag(self, y_hat, y):
        assert isinstance(y_hat, pd.Series)
        assert isinstance(y, pd.Series)

        # filter the padding and boundary tags by value (Series.drop
        # expects index labels), keeping y / y_hat aligned
        roll_y = pd.Series(y.values.reshape(-1))
        roll_y_hat = pd.Series(y_hat.values.reshape(-1))
        keep = ~roll_y.isin(['<PAD>', '*', '<STOP>', ','])
        roll_y = roll_y[keep]
        roll_y_hat = roll_y_hat[keep]

        most_frequent_tags = self.tag_corpus[:10]
        sc = Score(most_frequent_tags)
        sc.fit(roll_y, roll_y_hat)
        return sc.acc_per_tag(roll_y, roll_y_hat)
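_viterbi above implements the standard trigram recurrence pi(k, u, v) = max over t of [pi(k-1, t, u) + log q(v | t, u, sentence, k)], with word2tag_subspace and a beam restricting the candidate t values. A hypothetical end-to-end usage, stitched together from the examples above; constructor signatures vary between the snippets, so treat this as a sketch:

import pandas as pd

data = PreprocessTags(True).load_data(r'..\data\train.wtag')
x, y = pd.Series(data.x), pd.Series(data.y)

model = Model()
model.fit(x, y)           # builds features, then minimizes the loss
y_hat = model.predict(x)  # Viterbi decoding back to tag strings
print(model.accuracy(y_hat, y))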