Example #1
    def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
        """
        uses the trained model to predict the test set
        :param test_set: the test set
        :param test_labels_vector: the labels vector of the test set for accuracy computation
        :param report_accuracy: whether to compute and print the prediction accuracy
        """

        if self.model_name:
            from pyfasttext import FastText
            predictor = FastText()
            predictor.load_model('ft_extras/' + self.model_name + '.bin')
            predicted_labels = predictor.predict_proba(test_set)
            if report_accuracy and test_labels_vector:
                test_set_size = len(test_set)
                correct_predictions = 0
                invalid_labels = 0
                for index, labels in enumerate(predicted_labels):
                    if len(labels) != 0:
                        best_label = max(labels, key=lambda label: label[1])
                        if best_label[0] == test_labels_vector[index]:
                            correct_predictions += 1
                    else:
                        invalid_labels += 1
                        continue
                print('Prediction accuracy: {}\n'.format(correct_predictions / (test_set_size - invalid_labels)))
        else:
            print('Please use the train method to train a model first.')
            return
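The accuracy loop above hinges on the shape of predict_proba's output: one list of (label, probability) tuples per input line. Below is a minimal standalone sketch of consuming that shape; the model path and the test data are placeholders, not part of the original snippet.

from pyfasttext import FastText

# Placeholder model path and test data; substitute your own trained model.
predictor = FastText('ft_extras/my_model.bin')
test_set = ['first preprocessed sentence', 'second preprocessed sentence']
test_labels_vector = ['positive', 'negative']

correct = invalid = 0
for index, labels in enumerate(predictor.predict_proba(test_set)):
    if labels:
        # each entry is a list of (label, probability) tuples; keep the best one
        best_label, best_prob = max(labels, key=lambda pair: pair[1])
        correct += int(best_label == test_labels_vector[index])
    else:
        invalid += 1  # the model returned no label for this sample

print('Prediction accuracy: {}'.format(correct / (len(test_set) - invalid)))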
Example #2
import pkg_resources
from pyfasttext import FastText
from sklearn.base import BaseEstimator, ClassifierMixin


class Fasttext_clf(BaseEstimator, ClassifierMixin):
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        self.model = FastText(path)
        self.default = '0'

    def fit(self, X, y):
        # the fastText model is pre-trained; fit is a no-op kept for scikit-learn compatibility
        return self

    def predict(self, X):
        results = []
        if isinstance(X, str):
            # single string: predict_single returns a list of labels
            res = self.model.predict_single(X)
            results = results + [self.default if not res else res]
        elif isinstance(X, list):
            # list of strings: predict returns one list of labels per item
            results = results + self.model.predict(X)
        return results

    def predict_proba(self, X):
        results = []
        if isinstance(X, str):
            # single string: a list of (label, probability) tuples
            results = results + [self.model.predict_proba_single(X)]
        elif isinstance(X, list):
            # list of strings: one list of (label, probability) tuples per item
            results = results + self.model.predict_proba(X)
        return results
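Hypothetical usage of the wrapper above; it assumes the addr_detector package and its bundled ft_ad.ftz model are installed, and the sample strings are placeholders.

clf = Fasttext_clf()
clf.fit([], [])  # no-op: the underlying fastText model is already trained
print(clf.predict('123 Main Street, Springfield'))                    # single string input
print(clf.predict(['123 Main Street, Springfield', 'hello world']))   # batched input
print(clf.predict_proba('123 Main Street, Springfield'))              # [(label, probability), ...]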
Example #3
# -*- encoding: utf-8 -*-

from pyfasttext import FastText
classifier = FastText('/opt/app/fasttext/geek-model-v1.bin')

texts = [
    # ≈ "I'm Wen Tao ... Boss Wen, any new progress on the recruiting lately?"
    '我 是 温涛 发   温总 , 最近 招聘 有 什么 新 的 进展 吗 ?',
    # ≈ "I set up a new QQ work group; work matters can be discussed there from now on, please join."
    '我 新建 个 QQ 工作 群 , 以后 工作 的 事情 可以 在 群里 讨论 , 你加 一下'
]

#classifier = fasttext.load_model('/opt/app/fasttext/geek-model-v1.bin')

print('Model loaded successfully.')

labels = classifier.predict_proba(texts, 1)

print(labels)

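With k=1, predict_proba yields at most one (label, probability) pair per input line; the continuation sketch below unpacks it, reusing the classifier and texts objects defined in the script above.

for text, candidates in zip(texts, classifier.predict_proba(texts, 1)):
    if candidates:
        label, prob = candidates[0]
        print('{} -> {} ({:.3f})'.format(text, label, prob))
    else:
        print('{} -> no prediction'.format(text))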
Example #4
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pyfasttext can work around fasttext's "the C++ extension could not allocate enough memory" problem. In short: 1. train with the original fasttext library; 2. predict with pyfasttext.
"""

from pyfasttext import FastText
model = FastText('classify_model.bin')
print(model.labels)
print(model.nlabels)
texts = [
    # ≈ "What to eat to stop diarrhea fast _ baby has diarrhea _ strong sour smell _ topic Q&A"
    '吃 什么 止泻 快 _ 宝宝 拉肚子 _ 酸味 重 _ 专题 解答 ',
    # ≈ "Grow taller _ correct ways to gain height _ stimulate secondary bone growth; methods that help children grow taller"
    '增高 _ 正确 长高 方法 _ 刺激 骨骼 二次 生长发育   增高 精准 找到 长高 办法   ,   有助 孩子 长高 的 方法   ,'
]

# Top-2 labels together with their probabilities
labels = model.predict_proba(texts, k=2)
print(labels)

print(model.predict(texts, k=1))
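The workflow described in the docstring (train with the original fastText bindings, predict with pyfasttext) might look like the sketch below; it assumes the fasttext pip package (0.9+) with train_supervised, and train.txt, the hyperparameters, and the output path are placeholders.

import fasttext  # the official fastText Python package, used only for training
from pyfasttext import FastText

# 1. Train a supervised model with the original fasttext library.
#    train.txt holds one "__label__<tag> <tokenized text>" line per sample.
trained = fasttext.train_supervised(input='train.txt', epoch=25, lr=1.0, wordNgrams=2)
trained.save_model('classify_model.bin')

# 2. Load the saved .bin with pyfasttext for prediction.
model = FastText('classify_model.bin')
print(model.predict_proba(['吃 什么 止泻 快'], k=2))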
Example #5
class FasttextClassifier(TextClassificationModel):
    FT_MODEL_KEY = "ft_classifier.model"

    REQUIREMENTS = ["pyfasttext"]

    def __init__(self,
                 ft_classifier_path,
                 transform_func=None,
                 init_func=None,
                 **kwargs):
        super(FasttextClassifier, self).__init__(**kwargs)

        if not os.path.isfile(ft_classifier_path):
            raise FileNotFoundError('File does not exist: %s' %
                                    ft_classifier_path)

        if self.name is None:
            # use filename without extension as name
            self.name = simplify(get_file_name(ft_classifier_path))

        self.init_func = init_func
        self.transform_func = transform_func

        self.add_requirements(FasttextClassifier.REQUIREMENTS)

        self.add_file(FasttextClassifier.FT_MODEL_KEY, ft_classifier_path)
        self._init_model()

    @overrides
    def _init_model(self):
        self.model_instance = FastText(
            self.get_file(FasttextClassifier.FT_MODEL_KEY))
        if self.init_func:
            self.init_func(model=self)

    @overrides
    def _save_model(self, output_path):
        del self.model_instance

    @overrides
    def _predict(self, data, limit=None, **kwargs):
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels

        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                data = self.transform_func(data, model=self, **kwargs)
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")

        prediction = self.model_instance.predict_proba_single(data, k=limit)
        if not prediction:
            # add placeholder predictions if the model failed to predict anything.
            # TODO is this the best solution
            new_labels = ["NO_PREDICTION"] * limit
            new_probabilities = [0.0] * limit
            prediction = list(zip(new_labels, new_probabilities))

        return pd.DataFrame(prediction, columns=[ITEM_COLUMN, SCORE_COLUMN])

    @overrides
    def predict_batch(self, data, limit=None, **kwargs):
        # TODO: implement a better batch prediction function
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels

        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                # transform is only on a single item
                data = [
                    self.transform_func(item, model=self, **kwargs)
                    for item in data
                ]
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")

        prediction = self.model_instance.predict_proba(data, k=limit)

        labels = []
        probabilities = []

        for ind, entry in enumerate(prediction):
            if not entry:
                new_labels = ["NO_PREDICTION"] * limit
                new_probabilities = [0.0] * limit
            else:
                new_labels, new_probabilities = list(zip(*entry))
            labels += [list(new_labels)]
            probabilities += [list(new_probabilities)]

        df = pd.DataFrame()
        df[ITEM_COLUMN] = labels
        df[SCORE_COLUMN] = probabilities

        return df
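A hypothetical construction of the classifier above: the model path, the lowercasing transform, and any extra keyword arguments expected by TextClassificationModel are assumptions for illustration only.

def lowercase_transform(text, model=None, **kwargs):
    # per-item preprocessing applied before prediction (placeholder transform)
    return text.lower()

clf = FasttextClassifier('ft_news.bin', transform_func=lowercase_transform)
result = clf.predict_batch(['Some News Headline', 'Another Headline'], limit=3)
print(result)  # DataFrame with one label list and one probability list per row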