Esempio n. 1
0
def load_gensimtopicmodel(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Restore a gensim-based topic modeler that was saved to disk.

    For a non-compact model, a plain :class:`GensimTopicModeler` is created and
    its `loadmodel` method is called with `name`. For a compact model, the
    modeler class is looked up from the classifier name reported by
    `get_model_classifier_name` ('ldatopic', 'lsitopic' or 'rptopic') and its
    `load_compact_model` method is called with `name`.

    :param name: name (if compact=True) or prefix (if compact=False) of the file path
    :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a topic modeler
    :raise KeyError: if the classifier name of a compact model is not one of the three known names
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: GensimTopicModeler
    """
    # Non-compact files: the generic modeler loads from the file prefix.
    if not compact:
        modeler = GensimTopicModeler(preprocessor=preprocessor)
        modeler.loadmodel(name)
        return modeler

    # Compact file: the stored classifier name selects the concrete class.
    compact_modelers = {
        'ldatopic': LDAModeler,
        'lsitopic': LSIModeler,
        'rptopic': RPModeler
    }
    model_key = str(get_model_classifier_name(name))
    modeler_class = compact_modelers[model_key]

    modeler = modeler_class(preprocessor=preprocessor)
    modeler.load_compact_model(name)
    return modeler
Esempio n. 2
0
def train_autoencoder_cosineClassifier(
        classdict,
        nb_topics,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        normalize=True,
        *args,
        **kwargs):
    """ Train an autoencoder topic model and wrap it in a cosine distance classifier.

    An :class:`AutoencodingTopicModeler` is created and trained on `classdict`
    with `nb_topics` encoding dimensions; the trained modeler is then handed to
    :class:`TopicVecCosineDistanceClassifier`, which is returned.

    :param classdict: training data
    :param nb_topics: number of topics, i.e., number of encoding dimensions
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
    :param args: extra positional arguments forwarded to the modeler's `train` method
    :param kwargs: extra keyword arguments forwarded to the modeler's `train` method
    :return: a classifier that scores the short text based on the autoencoder
    :type classdict: dict
    :type nb_topics: int
    :type preprocessor: function
    :type normalize: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    # Step 1: fit the autoencoder that serves as the topic model.
    topic_encoder = AutoencodingTopicModeler(normalize=normalize,
                                             preprocessor=preprocessor)
    topic_encoder.train(classdict, nb_topics, *args, **kwargs)

    # Step 2: build the cosine distance classifier on top of it.
    cosine_classifier = TopicVecCosineDistanceClassifier(topic_encoder)
    return cosine_classifier
Esempio n. 3
0
def load_gensimtopicvec_cosineClassifier(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Restore a saved gensim topic model and wrap it in a cosine distance classifier.

    The topic model is loaded through :func:`load_gensimtopicmodel` with the
    same `name`, `preprocessor` and `compact` arguments, and the result is
    passed to :class:`TopicVecCosineDistanceClassifier`.

    For the non-compact format, the files include a JSON (.json) file that
    specifies various parameters, a gensim dictionary (.gensimdict), and a
    topic model (.gensimmodel). If weighing is applied, the tf-idf model
    (.gensimtfidf) is loaded as well.

    :param name: name (if compact=True) or prefix (if compact=False) of the file paths
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a classifier that scores the short text based on the topic model
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    return TopicVecCosineDistanceClassifier(
        load_gensimtopicmodel(name, preprocessor=preprocessor, compact=compact)
    )
def train_gensim_topicvec_sklearnclassifier(
        classdict,
        nb_topics,
        sklearn_classifier,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        topicmodel_algorithm='lda',
        toweigh=True,
        normalize=True,
        gensim_paramdict=None,
        sklearn_paramdict=None):
    """ Train the supervised learning classifier, with features given by topic vectors.

    It trains a topic model, and with its topic vector representation, train a supervised
    learning classifier. The instantiated (not trained) scikit-learn classifier must be
    passed into the argument.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections,"
    WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACM
    <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param classdict: training data
    :param nb_topics: number of topics in the topic model
    :param sklearn_classifier: instantiated scikit-learn classifier
    :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param topicmodel_algorithm: topic model algorithm; one of 'lda', 'lsi', 'rp' (Default: 'lda')
    :param toweigh: whether to weigh the words using tf-idf (Default: True)
    :param normalize: whether the retrieved topic vectors are normalized (Default: True)
    :param gensim_paramdict: keyword arguments passed on to the `train` method of the topic modeler; None means no extra arguments (Default: None)
    :param sklearn_paramdict: keyword arguments passed on to the `train` method of the wrapping classifier; None means no extra arguments (Default: None)
    :return: a trained classifier
    :raise KeyError: if `topicmodel_algorithm` is not 'lda', 'lsi' or 'rp'
    :type classdict: dict
    :type nb_topics: int
    :type sklearn_classifier: sklearn.base.BaseEstimator
    :type preprocessor: function
    :type topicmodel_algorithm: str
    :type toweigh: bool
    :type normalize: bool
    :type gensim_paramdict: dict or None
    :type sklearn_paramdict: dict or None
    :rtype: TopicVectorSkLearnClassifier
    """
    # A literal `{}` default would be one dict object shared by every call;
    # None is used as the "no extra arguments" sentinel instead.
    gensim_paramdict = {} if gensim_paramdict is None else gensim_paramdict
    sklearn_paramdict = {} if sklearn_paramdict is None else sklearn_paramdict

    # topic model training
    modelerdict = {'lda': LDAModeler, 'lsi': LSIModeler, 'rp': RPModeler}
    topicmodeler = modelerdict[topicmodel_algorithm](preprocessor=preprocessor,
                                                     toweigh=toweigh,
                                                     normalize=normalize)
    topicmodeler.train(classdict, nb_topics, **gensim_paramdict)

    # intermediate classification training
    classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier)
    classifier.train(classdict, **sklearn_paramdict)

    return classifier
 def __init__(self,
              preprocessor=textpreprocess.standard_text_preprocessor_1(),
              toweigh=True,
              normalize=True):
     """ Initialize the modeler with the algorithm fixed to 'rp'.

     All arguments are forwarded to `GensimTopicModeler.__init__`.

     :param preprocessor: function that preprocesses the text
     :param toweigh: forwarded as `toweigh` to the parent initializer (Default: True)
     :param normalize: forwarded as `normalize` to the parent initializer (Default: True)
     :type preprocessor: function
     :type toweigh: bool
     :type normalize: bool
     """
     parent_options = {
         'preprocessor': preprocessor,
         'algorithm': 'rp',
         'toweigh': toweigh,
         'normalize': normalize,
     }
     GensimTopicModeler.__init__(self, **parent_options)
 def __init__(self,
              preprocessor=textpreprocess.standard_text_preprocessor_1(),
              toweigh=True,
              normalize=True):
     """ Initialize the modeler with the algorithm fixed to 'rp', plus compact I/O.

     The topic-modeler part is set up through `GensimTopicModeler.__init__`;
     afterwards `CompactIOMachine.__init__` is called with the 'rptopic'
     classifier tag and the `rp_suffices` defined elsewhere in the module.

     :param preprocessor: function that preprocesses the text
     :param toweigh: forwarded as `toweigh` to the topic-modeler initializer (Default: True)
     :param normalize: forwarded as `normalize` to the topic-modeler initializer (Default: True)
     :type preprocessor: function
     :type toweigh: bool
     :type normalize: bool
     """
     # Topic-modeler state first, in the same order as before.
     GensimTopicModeler.__init__(
         self,
         algorithm='rp',
         preprocessor=preprocessor,
         toweigh=toweigh,
         normalize=normalize)

     # Then the compact-I/O state, tagged as an 'rptopic' model.
     model_tag = 'rptopic'
     CompactIOMachine.__init__(self, {'classifier': model_tag}, model_tag, rp_suffices)
def load_gensim_topicvec_sklearnclassifier(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Load the classifier, a wrapper that uses scikit-learn classifier, with
    feature vectors given by a topic model, from files.

    With `compact=True`, the topic model type is read from the 'topicmodel'
    field of the model configuration and the wrapper loads itself through
    `load_compact_model`. With `compact=False`, the topic model is loaded from
    the non-compact files with prefix `name`, and the scikit-learn classifier is
    read from `name + '.pkl'`.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections,"
    WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACM
    <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files
    :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a trained classifier
    :raise KeyError: if the 'topicmodel' field of a compact model is not 'ldatopic', 'lsitopic' or 'rptopic'
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVectorSkLearnClassifier
    """
    if compact:
        # load the compact model
        modelerdict = {
            'ldatopic': LDAModeler,
            'lsitopic': LSIModeler,
            'rptopic': RPModeler
        }
        topicmodel_name = cio.get_model_config_field(name, 'topicmodel')
        classifier = TopicVectorSkLearnClassifier(
            modelerdict[topicmodel_name](preprocessor=preprocessor), None)
        classifier.load_compact_model(name)
        classifier.trained = True

        # return the instance
        return classifier
    else:
        # loading topic model; `name` is a file prefix here, so the non-compact
        # loader must be requested explicitly (the helper defaults to compact=True)
        topicmodeler = load_gensimtopicmodel(name,
                                             preprocessor=preprocessor,
                                             compact=False)

        # loading intermediate model
        # NOTE: joblib.load unpickles the file; only load models from trusted sources.
        sklearn_classifier = joblib.load(name + '.pkl')

        # the wrapped classifier
        classifier = TopicVectorSkLearnClassifier(topicmodeler,
                                                  sklearn_classifier)
        classifier.trained = True

        # return the instance
        return classifier
Esempio n. 8
0
    def __init__(self,
                 preprocessor=textpreprocess.standard_text_preprocessor_1(),
                 normalize=True):
        """ Set up a fresh, untrained modeler.

        Stores the given preprocessor and normalization flag on the instance
        and marks the modeler as not trained.

        :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
        :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
        :type preprocessor: function
        :type normalize: bool
        """
        # A newly constructed modeler has no model yet.
        self.trained = False
        self.normalize = normalize
        self.preprocessor = preprocessor
def train_autoencoder_topic_sklearnclassifier(
        classdict,
        nb_topics,
        sklearn_classifier,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        normalize=True,
        keras_paramdict=None,
        sklearn_paramdict=None):
    """ Train the supervised learning classifier, with features given by topic vectors.

    It trains an autoencoder topic model, and with its encoded vector representation, train a supervised
    learning classifier. The instantiated (not trained) scikit-learn classifier must be
    passed into the argument.

    # Reference

    Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha,
    "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents,"
    *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011).

    Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections,"
    WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACM
    <http://dl.acm.org/citation.cfm?id=1367510>`_]

    :param classdict: training data
    :param nb_topics: number of topics, i.e., number of encoding dimensions
    :param sklearn_classifier: instantiated scikit-learn classifier
    :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param normalize: whether the retrieved topic vectors are normalized (Default: True)
    :param keras_paramdict: keyword arguments passed to the autoencoder's `train` method; None means no extra arguments (Default: None)
    :param sklearn_paramdict: keyword arguments passed to the wrapping classifier's `train` method; None means no extra arguments (Default: None)
    :return: a trained classifier
    :type classdict: dict
    :type nb_topics: int
    :type sklearn_classifier: sklearn.base.BaseEstimator
    :type preprocessor: function
    :type normalize: bool
    :type keras_paramdict: dict or None
    :type sklearn_paramdict: dict or None
    :rtype: TopicVectorSkLearnClassifier
    """
    # A literal `{}` default would be one dict object shared by every call;
    # None is used as the "no extra arguments" sentinel instead.
    keras_paramdict = {} if keras_paramdict is None else keras_paramdict
    sklearn_paramdict = {} if sklearn_paramdict is None else sklearn_paramdict

    # train the autoencoder
    autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor,
                                           normalize=normalize)
    autoencoder.train(classdict, nb_topics, **keras_paramdict)

    # intermediate classification training
    classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier)
    classifier.train(classdict, **sklearn_paramdict)

    return classifier
    def __init__(self,
                 preprocessor=textpreprocess.standard_text_preprocessor_1(),
                 algorithm='lda',
                 toweigh=True,
                 normalize=True):
        """ Set up the topic modeler.

        The preprocessor and normalization flag are handled by
        `LatentTopicModeler.__init__`; the algorithm name and the weighing flag
        are stored on this instance.

        :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
        :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
        :param toweigh: whether to weigh the words using tf-idf. (Default: True)
        :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
        :type preprocessor: function
        :type algorithm: str
        :type toweigh: bool
        :type normalize: bool
        """
        # Parent state first, then the gensim-specific settings.
        LatentTopicModeler.__init__(self,
                                    normalize=normalize,
                                    preprocessor=preprocessor)
        self.toweigh = toweigh
        self.algorithm = algorithm
def load_autoencoder_topicmodel(
        name,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        compact=True):
    """ Restore an autoencoding topic model that was saved to disk.

    A new :class:`AutoencodingTopicModeler` is created with the given
    preprocessor; its `load_compact_model` method (compact=True) or its
    `loadmodel` method (compact=False) is then called with `name`.

    :param name: name (if compact=True) or prefix (if compact=False) of the paths of the model files
    :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: an autoencoder as a topic modeler
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: generators.bow.AutoEncodingTopicModeling.AutoencodingTopicModeler
    """
    modeler = AutoencodingTopicModeler(preprocessor=preprocessor)

    # Pick the loader matching the on-disk format, then run it.
    load = modeler.load_compact_model if compact else modeler.loadmodel
    load(name)

    return modeler
def load_autoencoder_cosineClassifier(name,
                                      preprocessor=textpreprocess.standard_text_preprocessor_1(),
                                      compact=True):
    """ Restore a saved autoencoder topic model and wrap it in a cosine classifier.

    The autoencoder is loaded through :func:`load_autoencoder_topicmodel` with
    the same `name`, `preprocessor` and `compact` arguments, and the result is
    passed to :class:`TopicVecCosineDistanceClassifier`.

    For the non-compact format, the model files share the given prefix: files
    with names ending with "_encoder.json" and "_encoder.h5" are the JSON and
    HDF5 files for the encoder respectively, and there is also a gensim
    dictionary (.gensimdict).

    :param name: name (if compact=True) or prefix (if compact=False) of the file paths
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param compact: whether model file is compact (Default: True)
    :return: a classifier that scores the short text based on the autoencoder
    :type name: str
    :type preprocessor: function
    :type compact: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    return TopicVecCosineDistanceClassifier(
        load_autoencoder_topicmodel(name,
                                    preprocessor=preprocessor,
                                    compact=compact)
    )
Esempio n. 13
0
def train_gensimtopicvec_cosineClassifier(
        classdict,
        nb_topics,
        preprocessor=textpreprocess.standard_text_preprocessor_1(),
        algorithm='lda',
        toweigh=True,
        normalize=True,
        *args,
        **kwargs):
    """ Train a gensim topic model and wrap it in a cosine distance classifier.

    A :class:`GensimTopicModeler` is created with the given settings and
    trained on `classdict` with `nb_topics` topics; the trained modeler is then
    handed to :class:`TopicVecCosineDistanceClassifier`, which is returned.

    :param classdict: training data
    :param nb_topics: number of latent topics
    :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
    :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
    :param toweigh: whether to weigh the words using tf-idf. (Default: True)
    :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
    :param args: extra positional arguments forwarded to the modeler's `train` method
    :param kwargs: extra keyword arguments forwarded to the modeler's `train` method
    :return: a classifier that scores the short text based on the topic model
    :type classdict: dict
    :type nb_topics: int
    :type preprocessor: function
    :type algorithm: str
    :type toweigh: bool
    :type normalize: bool
    :rtype: TopicVecCosineDistanceClassifier
    """
    # Step 1: fit the gensim topic model.
    modeler_settings = {
        'preprocessor': preprocessor,
        'algorithm': algorithm,
        'toweigh': toweigh,
        'normalize': normalize,
    }
    gensim_modeler = GensimTopicModeler(**modeler_settings)
    gensim_modeler.train(classdict, nb_topics, *args, **kwargs)

    # Step 2: build the cosine distance classifier on top of it.
    cosine_classifier = TopicVecCosineDistanceClassifier(gensim_modeler)
    return cosine_classifier