Esempio n. 1
0
 def __init__(self, filepath, data_path, embedding_path, embedding=None):
     """
     Fit the tokenizer on the training essays, then build and compile the network.

     Parameters
     ----------
     filepath : str
         Stored on the instance as ``_filepath``; it is not otherwise used here.
     data_path : str
         Handed to ``score_model_helper.get_dataframe``; the 'essay' column of the result is used to fit the
         tokenizer. Also remembered as the default data path.
     embedding_path : str
         Handed to ``get_embedding_matrix`` when ``embedding`` is not supplied.
     embedding : optional
         Ready-made weights for the embedding layer. Presumably a (vocab_size, 300) matrix -- confirm.
     """
     super().__init__()

     essays = score_model_helper.get_dataframe(data_path)['essay']
     self._tokenizer.fit_on_texts(essays)
     # One more than the number of known words; presumably index 0 is reserved -- confirm.
     self._vocab_size = 1 + len(self._tokenizer.word_index)

     # Only compute the embedding matrix when the caller did not hand one in.
     self._embedding = self.get_embedding_matrix(embedding_path) if embedding is None else embedding

     # Frozen embedding -> LSTM -> max-pool over the sequence -> two dense layers ending in a sigmoid unit.
     layer_stack = (
         Embedding(self._vocab_size, 300, weights=[self._embedding], input_length=200, trainable=False),
         LSTM(128, dropout=0.1, return_sequences=True),
         GlobalMaxPooling1D(),
         Dense(64, activation='relu'),
         Dense(1, activation='sigmoid'),
     )
     for layer in layer_stack:
         self._model.add(layer)

     self._model.compile(
         loss='mean_squared_error',
         optimizer='adam',
         metrics=['mae', 'mape', 'mse'],
     )

     self._filepath = filepath
     self.__data_path = data_path
Esempio n. 2
0
    def load_data(self, filepath=None):
        """
        Loads the data into the score model, and then initiates model training

        Every 'domain1_score' is rescaled with an offset and divisor chosen by the row's 'essay_set' and stored in
        a 'normal' column next to the row's 'essay_id'. Rows whose 'essay_set' is not one of 1-8 get NaN.

        Parameters
        ----------
        filepath : str
            Should be a filepath to a .csv file with an 'essay_id', 'essay_set', 'essay', and 'domain1_score' column.
            If not provided, then the default filepath will be used in its place if one exists. When provided, it
            also replaces the stored default.

        Returns
        -------
        bool
            The result of ``train_and_test``: True if the model training was successful, otherwise False.
        """
        # essay_set -> (offset, divisor), applied as (score - offset) / divisor.
        # Presumably the lowest score and the score span of each set -- confirm against the dataset.
        scaling = {
            1: (2, 10),
            2: (1, 5),
            3: (0, 3),
            4: (0, 3),
            5: (0, 4),
            6: (0, 4),
            7: (0, 30),
            8: (0, 60),
        }

        if filepath is not None:
            self.__data_path = filepath
        x = score_model_helper.get_dataframe(self.__data_path)  # Training data

        # Build the targets column-wise. Growing a frame one row at a time re-allocates it on every insert
        # and may upcast the column dtypes along the way.
        essay_sets = x['essay_set']
        offsets = essay_sets.map({key: offset for key, (offset, _) in scaling.items()})
        divisors = essay_sets.map({key: divisor for key, (_, divisor) in scaling.items()})
        normal = (x['domain1_score'] - offsets) / divisors  # NaN wherever the essay set is unknown

        y = pandas.DataFrame({
            'essay_id': x['essay_id'],
            'normal': normal.astype('float32'),
        })

        # The two trailing arguments are passed through unchanged; their meaning is defined by train_and_test.
        return self.train_and_test(x, y, 4, 4)
Esempio n. 3
0
    def load_data(self, filepath=None):
        """
        Loads the data into the score model, and then initiates model training

        The third comma-separated field of each 'comments' value (the 'STY#' part) is turned into a target stored
        in a 'normal' column next to the row's 'essay_id': 0.0 if the field contains '1', otherwise 0.5 if it
        contains '2', otherwise 1.0.

        Parameters
        ----------
        filepath : str
            Should be a filepath to a .csv file with an 'essay_id', 'essay' and 'comments' column, where the 'comments'
            column should contain 'ID#,ORG#,STY#', where the # is either 1, 2, or 3. If not provided, then the default
            filepath will be used in its place if one exists. When provided, it also replaces the stored default.

        Returns
        -------
        bool
            The result of ``train_and_test``: True if the model training was successful, otherwise False.

        Raises
        ------
        IndexError
            If a 'comments' value has fewer than three comma-separated fields.
        AttributeError
            If a 'comments' value is not a string (for example a missing value).
        """
        def comment_to_target(comments):
            # Map the third field of one 'comments' value to its target.
            field = comments.split(',')[2]
            if '1' in field:
                return 0.0
            if '2' in field:
                return 0.5
            return 1.0

        if filepath is not None:
            self.__data_path = filepath
        x = score_model_helper.get_dataframe(self.__data_path)  # Training data

        # Build the targets in one go. Growing a frame one row at a time re-allocates it on every insert
        # and may upcast the column dtypes along the way.
        targets = [comment_to_target(comments) for comments in x['comments']]
        y = pandas.DataFrame({
            'essay_id': x['essay_id'],
            'normal': pandas.Series(targets, index=x.index, dtype='float32'),
        })

        # The two trailing arguments are passed through unchanged; their meaning is defined by train_and_test.
        return self.train_and_test(x, y, 8, 4)