Exemple #1
0
    def score(self, shorttext):
        """ Compute a classification score for every class label of a short sentence.

        The sentence is converted with :func:`~translate_shorttext_intfeature_matrix`,
        wrapped into a batch of one, and passed to the model. The first row of the
        prediction is then mapped onto the class labels, position by position.

        If the model is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: a short sentence
        :return: a dictionary mapping each class label to its classification score
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: dict
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        features = self.translate_shorttext_intfeature_matrix(shorttext)
        # the model receives a batch holding this single sentence
        batch = np.array([features])
        predictions = self.model.predict(batch)

        scores = {}
        for position, label in enumerate(self.classlabels):
            scores[label] = predictions[0][position]
        return scores
    def savemodel(self, nameprefix):
        """ Save the trained model into files.

        Given the prefix of the file paths, save the model into files, with name given by the prefix.
        The model itself is written through `kerasio.save_model`; in addition, the class
        labels are written one per line to a file ending with "_classlabels.txt", and
        a file ending with "_config.json" stores `with_gensim` (always False here),
        `maxlen` and `vecsize`.

        If there is no trained model, a `ModelNotTrainedException` will be thrown.

        :param nameprefix: prefix of the file path
        :return: None
        :type nameprefix: str
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        kerasio.save_model(nameprefix, self.model)

        # context managers guarantee the files are flushed and closed,
        # even if writing fails half-way
        with open(nameprefix + '_classlabels.txt', 'w') as labelfile:
            labelfile.write('\n'.join(self.classlabels))

        config = {
            'with_gensim': False,
            'maxlen': self.maxlen,
            'vecsize': self.vecsize,
        }
        with open(nameprefix + '_config.json', 'w') as configfile:
            json.dump(config, configfile)
    def score(self, shorttext):
        """ Compute a classification score for every class label of a short sentence.

        The input for the neural network is prepared in one of two ways: when
        `with_gensim` is set, the text goes through :func:`~process_text`;
        otherwise the result of :func:`~shorttext_to_matrix` is wrapped into a
        batch of one. The first row of the model prediction is then mapped onto
        the class labels, position by position.

        If the model is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: a short sentence
        :return: a dictionary mapping each class label to its classification score
        :type shorttext: str
        :rtype: dict
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        # build the network input according to the configured pipeline
        network_input = (
            self.process_text(shorttext)
            if self.with_gensim
            else np.array([self.shorttext_to_matrix(shorttext)])
        )

        # run the neural network
        output = self.model.predict(network_input)

        # pair each label with the score at the same position
        return {
            label: output[0][position]
            for position, label in enumerate(self.classlabels)
        }
    def score(self, shorttext):
        """ Compute a classification score for every class label of a short sentence.

        The sentence is turned into an embedded vector with
        :func:`~shorttext_to_embedvec`, wrapped into a batch of one, and passed
        to the model. The first row of the prediction is mapped onto the class
        labels, position by position.

        If the model is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: a short sentence
        :return: a dictionary mapping each class label to its classification score
        :type shorttext: str
        :rtype: dict
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        # embedded vector of the sentence
        vector = np.array(self.shorttext_to_embedvec(shorttext))

        # the network expects a batch, so wrap the single vector
        batch = np.array([vector])
        output = self.model.predict(batch)

        # pair each label with the score at the same position
        scores = {}
        for position, label in enumerate(self.classlabels):
            scores[label] = output[0][position]
        return scores
Exemple #5
0
    def savemodel(self, nameprefix):
        """ Save the trained model into files.

        Given the prefix of the file paths, save the model into files, with name given by the prefix.
        There will be five files produced, one name ending with "_classlabels.txt", one with ".json",
        one with ".h5", one with "_labelidx.pkl", and one with "_dictionary.dict".

        If there is no trained model, a `ModelNotTrainedException` will be thrown.

        :param nameprefix: prefix of the file path
        :return: None
        :type nameprefix: str
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        kerasio.save_model(nameprefix, self.model)

        self.dictionary.save(nameprefix+'_dictionary.dict')

        # context managers guarantee the files are flushed and closed,
        # even if writing fails half-way
        with open(nameprefix+'_classlabels.txt', 'w') as labelfile:
            labelfile.write('\n'.join(self.classlabels))

        with open(nameprefix+'_labelidx.pkl', 'wb') as idxfile:
            pickle.dump(self.labels2idx, idxfile)
Exemple #6
0
    def score(self, shorttext):
        """ Compute a score for every class label of a short sentence.

        The sentence is converted with :func:`~shorttext_to_embedvec`; for each
        class stored in `addvec`, the score is the value of `simfcn` applied to
        that vector and the class vector. When `simfcn` (or the lookup) raises
        `ValueError`, the score for that class is `numpy.nan`.

        If the model is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: a short sentence
        :return: a dictionary mapping each class label to its score
        :type shorttext: str
        :rtype: dict
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        textvec = self.shorttext_to_embedvec(shorttext)

        def similarity_or_nan(label):
            # a ValueError for one class must not abort scoring of the others
            try:
                return self.simfcn(textvec, self.addvec[label])
            except ValueError:
                return np.nan

        return {label: similarity_or_nan(label) for label in self.addvec}
Exemple #7
0
    def savemodel(self, nameprefix):
        """ Save the model with names according to the prefix.

        Given the prefix of the file paths, save the corresponding topic model. The files
        include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict),
        a topic model (.gensimmodel), and the similarity matrix (.gensimmat).
        If weighing is applied, the tf-idf model (.gensimtfidf) is saved as well.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param nameprefix: prefix of the file paths
        :return: None
        :raise: ModelNotTrainedException
        :type nameprefix: str
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        parameters = {
            'nb_topics': self.nb_topics,
            'toweigh': self.toweigh,
            'algorithm': self.algorithm,
            'classlabels': self.classlabels,
        }
        # the context manager closes the file even if serialization fails
        with open(nameprefix + '.json', 'w') as paramfile:
            json.dump(parameters, paramfile)

        self.dictionary.save(nameprefix + '.gensimdict')
        self.topicmodel.save(nameprefix + '.gensimmodel')
        self.matsim.save(nameprefix + '.gensimmat')
        if self.toweigh:
            self.tfidf.save(nameprefix + '.gensimtfidf')
    def savemodel(self, nameprefix, save_complete_autoencoder=True):
        """ Save the model with names according to the prefix.

        Given the prefix of the file paths, save the model into files, with name given by the prefix.
        A JSON (.json) file stores `nb_topics` and `classlabels`, the gensim dictionary is
        saved with extension ".gensimdict", the encoder is saved through `kerasio.save_model`
        under the prefix followed by "_encoder", and the class topic vectors are pickled
        into a file ending with "_classtopicvecs.pkl".

        If `save_complete_autoencoder` is True, the decoder and the complete autoencoder are
        saved as well, under the prefix followed by "_decoder" and "_autoencoder".

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param nameprefix: prefix of the paths of the file
        :param save_complete_autoencoder: whether to store the decoder and the complete autoencoder (Default: True; but False for version <= 0.2.1)
        :return: None
        :raise: ModelNotTrainedException
        :type nameprefix: str
        :type save_complete_autoencoder: bool
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        parameters = {
            'nb_topics': self.nb_topics,
            'classlabels': self.classlabels,
        }
        # json.dump emits str, so the file has to be opened in text mode
        with open(nameprefix + '.json', 'w') as paramfile:
            json.dump(parameters, paramfile)

        self.dictionary.save(nameprefix + '.gensimdict')
        kerasio.save_model(nameprefix + '_encoder', self.encoder)
        if save_complete_autoencoder:
            kerasio.save_model(nameprefix + '_decoder', self.decoder)
            kerasio.save_model(nameprefix + '_autoencoder', self.autoencoder)

        # pickle emits bytes, so the file has to be opened in binary mode
        with open(nameprefix + '_classtopicvecs.pkl', 'wb') as vecfile:
            pickle.dump(self.classtopicvecs, vecfile)
    def getvector(self, shorttext):
        """ Return the topic vector of the given short text.

        The work is delegated to `retrieve_topicvec` of the underlying topic modeler.

        If this object is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: short text
        :return: topic vector representation
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        if self.trained:
            return self.topicmodeler.retrieve_topicvec(shorttext)
        raise e.ModelNotTrainedException()
Exemple #10
0
    def retrieve_corpus_topicdist(self, shorttext):
        """ Calculate the topic vector representation of the short text, in the corpus form.

        The bag-of-words of the text is obtained from :func:`~retrieve_bow`. If
        `toweigh` is set, it is first transformed by the tf-idf model; the result
        is then looked up in the topic model.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: text to be represented
        :return: topic vector in the corpus form
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: list
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        corpus = self.retrieve_bow(shorttext)
        if self.toweigh:
            # apply tf-idf weighting before querying the topic model
            corpus = self.tfidf[corpus]
        return self.topicmodel[corpus]
Exemple #11
0
    def savemodel(self, nameprefix):
        """ Save the trained model into files.

        Given the prefix of the file paths, pickle the class vectors (`addvec`) into
        the file whose name is the prefix followed by "_embedvecdict.pkl".
        If there is no trained model, a `ModelNotTrainedException` will be thrown.

        :param nameprefix: prefix of the file path
        :return: None
        :type nameprefix: str
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        # pickle emits bytes, so the file has to be opened in binary mode;
        # the context manager makes sure it is closed afterwards
        with open(nameprefix+'_embedvecdict.pkl', 'wb') as vecfile:
            pickle.dump(self.addvec, vecfile)
    def classify(self, shorttext):
        """ Return the class label predicted by the classifier for the given short text.

        The topic vector from :func:`~getvector` is passed, as a batch of one, to
        the classifier's `predict`; the first predicted value is used as an index
        into the class labels.

        If this object is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: short text
        :return: class label of the classification result of the given short text
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: str
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        vector = self.getvector(shorttext)
        predicted = self.classifier.predict([vector])
        label_index = predicted[0]
        return self.classlabels[label_index]
    def preprocess_text_correct(self, text):
        """ Generate, token by token, the vectors used for correcting the text.

        The text is split with `nospace_tokenize`; for every token the binarizer's
        `change_nothing` is called with the configured operation, and the first
        element of the returned pair is yielded.

        As this is a generator, `ModelNotTrainedException` is raised when the
        first item is requested, not when the function is called.

        :param text: text
        :return: generator that outputs the numpy vectors for correction
        :type text: str
        :rtype: generator
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise ce.ModelNotTrainedException()
        for word in nospace_tokenize(text):
            # only the first element of the returned pair is needed here
            inputvec, _unused = self.binarizer.change_nothing(word, self.operation)
            yield inputvec
    def retrieve_topicvec(self, shorttext):
        """ Calculate the encoded vector representation of the short text.

        The bag-of-words vector from :func:`~retrieve_bow_vector` is passed, as a
        batch of one, to the encoder; the first row of its output is the result.
        If `normalize` is set, that vector is divided in place by its norm.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: short text
        :return: encoded vector representation of the short text
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        batch = np.array([self.retrieve_bow_vector(shorttext)])
        latent = self.encoder.predict(batch)[0]
        if not self.normalize:
            return latent
        # scale in place by the norm of the vector
        latent /= np.linalg.norm(latent)
        return latent
    def correct(self, word):
        """ Recommend a spell correction to the given word.

        The vectors produced by :func:`~preprocess_text_correct` are transposed and
        stacked into one array, which is passed to the model. For the first row of
        the prediction, the index of the largest value along the last axis is looked
        up in the dictionary, and the resulting entries are joined with spaces.

        :param word: a given word
        :return: recommended correction
        :type word: str
        :rtype: str
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise ce.ModelNotTrainedException()

        transposed = []
        for vector in self.preprocess_text_correct(word):
            transposed.append(vector.transpose())
        predicted = self.model.predict(np.array(transposed))

        # most likely dictionary index at every position
        best_indices = predicted.argmax(axis=-1)
        return ' '.join(self.dictionary[idx] for idx in best_indices[0])
    def get_batch_cos_similarities(self, shorttext):
        """ Calculate the score, which is the cosine similarity with the topic vector of the model,
        of the short text against each class labels.

        The score for a class is one minus the `cosine` distance between the class
        topic vector and the topic vector of the text from :func:`~retrieve_topicvec`.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: short text
        :return: dictionary of scores of the text to all classes
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: dict
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        # the topic vector of the text is the same for every class,
        # so compute it once instead of once per label
        textvec = self.retrieve_topicvec(shorttext)
        return {
            label: 1 - cosine(classvec, textvec)
            for label, classvec in self.classtopicvecs.items()
        }
    def savemodel(self, nameprefix):
        """ Save the topic model and the classifier.

        The topic modeler saves itself through its own `savemodel` using the same
        prefix; the classifier is then dumped with `joblib` into the file named
        `nameprefix` followed by the extension `.pkl`.

        If this object is not flagged as trained, `ModelNotTrainedException` is raised.

        :param nameprefix: prefix of the paths of the model files
        :return: None
        :raise: ModelNotTrainedException
        :type nameprefix: str
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        # topic model first, classifier second
        self.topicmodeler.savemodel(nameprefix)
        classifier_path = nameprefix + '.pkl'
        joblib.dump(self.classifier, classifier_path)
Exemple #18
0
    def savemodel(self, prefix, final=False):
        """ Save the trained models into multiple files.

        The hyperparameters (`vecsize`, `latent_dim`) go into a file ending with
        "_s2s_hyperparam.json". For the whole model, the encoder and the decoder,
        an ".h5" file and a ".json" file (the output of `to_json`) are written,
        using the prefix, the prefix plus "_encoder", and the prefix plus "_decoder".

        To save it compactly, call :func:`~save_compact_model`.

        If `final` is set to `True`, only the weights are stored in the ".h5" files
        (`save_weights` instead of `save`), and the model cannot be further trained.

        If there is no trained model, a `ModelNotTrainedException` will be thrown.

        :param prefix: prefix of the file path
        :param final: whether the model is final (that should not be trained further) (Default: False)
        :return: None
        :type prefix: str
        :type final: bool
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        # save hyperparameters
        # (json.dump and to_json produce str, so all JSON files are opened in text mode)
        with open(prefix + '_s2s_hyperparam.json', 'w') as paramfile:
            json.dump({
                'vecsize': self.vecsize,
                'latent_dim': self.latent_dim
            }, paramfile)

        # save whole model
        if final:
            self.model.save_weights(prefix + '.h5')
        else:
            self.model.save(prefix + '.h5')
        with open(prefix + '.json', 'w') as modelfile:
            modelfile.write(self.model.to_json())

        # save encoder and decoder
        if final:
            self.encoder_model.save_weights(prefix + '_encoder.h5')
            self.decoder_model.save_weights(prefix + '_decoder.h5')
        else:
            self.encoder_model.save(prefix + '_encoder.h5')
            self.decoder_model.save(prefix + '_decoder.h5')
        with open(prefix + '_encoder.json', 'w') as encoderfile:
            encoderfile.write(self.encoder_model.to_json())
        with open(prefix + '_decoder.json', 'w') as decoderfile:
            decoderfile.write(self.decoder_model.to_json())
    def savemodel(self, prefix):
        """ Save the model.

        The model is written through `kerasio.save_model`, the dictionary is saved
        into the file ending with "_vocabs.gensimdict", and the settings (`alph`,
        `special_signals`, `operation`, `batchsize`, `nb_hiddenunits`) are stored
        in the file ending with "_config.json".

        If the model is not flagged as trained, `ModelNotTrainedException` is raised.

        :param prefix: prefix of the model path
        :return: None
        :type prefix: str
        :raise: ModelNotTrainedException
        """
        if not self.trained:
            raise ce.ModelNotTrainedException()
        kerasio.save_model(prefix, self.model)
        self.dictionary.save(prefix + '_vocabs.gensimdict')
        parameters = {
            'alph': self.alph,
            'special_signals': self.specialsignals,
            'operation': self.operation,
            'batchsize': self.batchsize,
            'nb_hiddenunits': self.nb_hiddenunits
        }
        # the context manager closes the file even if serialization fails
        with open(prefix + '_config.json', 'w') as configfile:
            json.dump(parameters, configfile)
Exemple #20
0
    def savemodel(self, nameprefix):
        """ Save the logistic stacked model into files.

        A dictionary holding `classifier2idx` (key 'classifiers') and the class labels
        (key 'classlabels') is pickled into the file ending with "_stackedlogistics.pkl";
        the model is written through `kerasio.save_model` under the prefix followed by
        "_stackedlogistics". Note that the intermediate classifiers
        are not saved. Users are advised to save those classifiers separately.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param nameprefix: prefix of the files
        :return: None
        :raise: ModelNotTrainedException
        :type nameprefix: str
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        stackedmodeldict = {'classifiers': self.classifier2idx,
                            'classlabels': self.classlabels}
        # the context manager closes the file even if pickling fails
        with open(nameprefix+'_stackedlogistics.pkl', 'wb') as picklefile:
            pickle.dump(stackedmodeldict, picklefile)
        kerasio.save_model(nameprefix+'_stackedlogistics', self.model)
    def score(self, shorttext):
        """ Calculate a score of the short text against each class label.

        The topic vector from :func:`~getvector` is passed, as a batch of one, to
        the classifier's `score` once per class, together with the index of that
        class; the returned value is the score of the class.

        If this object is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: short text
        :return: dictionary of scores of the text to all classes
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: dict
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        vector = self.getvector(shorttext)
        scores = {}
        for index, label in enumerate(self.classlabels):
            scores[label] = self.classifier.score([vector], [index])
        return scores
Exemple #22
0
    def retrieve_topicvec(self, shorttext):
        """ Calculate the topic vector representation of the short text.

        The (topic id, fraction) pairs returned by :func:`~retrieve_corpus_topicdist`
        are written into a zero vector of length `nb_topics`. If `normalize` is
        set, the vector is divided by its norm.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: text to be represented
        :return: topic vector
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        dense = np.zeros(self.nb_topics)
        # topics missing from the distribution keep the value zero
        for topic_index, weight in self.retrieve_corpus_topicdist(shorttext):
            dense[topic_index] = weight
        if not self.normalize:
            return dense
        dense /= np.linalg.norm(dense)
        return dense
Exemple #23
0
    def score(self, shorttext, default_score=0.0):
        """ Calculate a score of the short text against each class label.

        The topic vector from :func:`~getvector` is passed, as a batch of one, to
        the classifier's `score` once per class, together with the index of that
        class; the returned value is the score of the class.

        If this object is not flagged as trained, `ModelNotTrainedException` is raised.

        :param shorttext: short text
        :param default_score: kept for backward compatibility; every class label is always assigned a score, so this value is never used (Default: 0.0)
        :return: dictionary of scores of the text to all classes
        :raise: ModelNotTrainedException
        :type shorttext: str
        :type default_score: float
        :rtype: dict
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        topicvec = self.getvector(shorttext)
        # the comprehension already builds a fresh plain dict; no copy is needed
        return {
            classlabel: self.classifier.score([topicvec], [classidx])
            for classidx, classlabel in enumerate(self.classlabels)
        }
Exemple #24
0
 def __contains__(self, shorttext):
     """ Report membership of a short text; always True once the model is trained.

     :param shorttext: short text (not inspected)
     :return: True
     :raise: ModelNotTrainedException
     :type shorttext: str
     :rtype: bool
     """
     if self.trained:
         return True
     raise e.ModelNotTrainedException()
 def preprocess_text_correct(self, text):
     """ Generate, token by token, the binarizer output used for correcting the text.

     The text is split with `nospace_tokenize`; for every token the value returned
     by the binarizer's `change_nothing`, called with the configured operation,
     is yielded unchanged.

     As this is a generator, `ModelNotTrainedException` is raised when the
     first item is requested, not when the function is called.

     :param text: text
     :return: generator over the binarizer outputs, one per token
     :type text: str
     :rtype: generator
     :raise: ModelNotTrainedException
     """
     if not self.trained:
         raise ce.ModelNotTrainedException()
     # NOTE(review): the whole return value of change_nothing is yielded here;
     # confirm whether it is a single vector or a pair for this binarizer version.
     for word in nospace_tokenize(text):
         yield self.binarizer.change_nothing(word, self.operation)