def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
    """
    Uses the trained model to predict the test set.

    :param test_set: the test set (list of preprocessed text samples)
    :param test_labels_vector: the labels vector of the test set for accuracy computation
    :param report_accuracy: defines whether to report the prediction accuracy or not
    :return: the per-sample label/probability predictions, or None when no
             model has been trained yet
    """
    if not self.model_name:
        print('Please use the train method to train a model first.')
        return None

    # Lazy import so the dependency is only required when predicting.
    from pyfasttext import FastText
    predictor = FastText()
    predictor.load_model('ft_extras/' + self.model_name + '.bin')
    predicted_labels = predictor.predict_proba(test_set)

    if report_accuracy and test_labels_vector:
        test_set_size = len(test_set)
        correct_predictions = 0
        invalid_labels = 0  # samples for which the model produced no label
        for index, labels in enumerate(predicted_labels):
            if len(labels) != 0:
                # highest-probability (label, proba) pair wins
                best_label = max(labels, key=lambda label: label[1])
                if best_label[0] == test_labels_vector[index]:
                    correct_predictions += 1
            else:
                invalid_labels += 1
        valid_predictions = test_set_size - invalid_labels
        # Guard against division by zero when every sample got no label
        # (previously this raised ZeroDivisionError).
        if valid_predictions > 0:
            print('Prediction accuracy:{}\n'.format(
                correct_predictions / valid_predictions))
        else:
            print('Prediction accuracy could not be computed: '
                  'no valid predictions.\n')
    # Bug fix: previously the predictions were computed and discarded.
    return predicted_labels
class Fasttext_clf(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around a pretrained fastText address classifier."""

    # Pretrained model bundled with the addr_detector package.
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        # NOTE(review): loading here means sklearn clone()/unpickling re-reads
        # the model file — confirm this is acceptable for the callers.
        self.model = FastText(path)
        # Label returned when the model yields no prediction for a single string.
        self.default = '0'

    def fit(self, X, y):
        """No-op: the model is pretrained. Returns self per sklearn convention."""
        return self

    def predict(self, X):
        """Predict labels for a single string or a list of strings.

        :param X: one preprocessed string, or a list of them
        :return: list of predictions; for a single string the configured
                 default label is substituted when the model returns nothing
        """
        results = []
        if isinstance(X, str):
            res = self.model.predict_single(X)
            results.append(self.default if not res else res)
        elif isinstance(X, list):
            # Bug fix: the model was previously invoked twice for list input
            # (one call assigned to an unused variable); call it once.
            results.extend(self.model.predict(X))
        return results

    def predict_proba(self, X):
        """Predict (label, probability) pairs for a string or list of strings.

        :param X: one preprocessed string, or a list of them
        :return: list of per-sample (label, probability) predictions
        """
        results = []
        if isinstance(X, str):
            results.append(self.model.predict_proba_single(X))
        elif isinstance(X, list):
            results.extend(self.model.predict_proba(X))
        return results
# -*- encoding: utf-8 -*-
"""Smoke-test script: load a trained fastText model and print the top-1
label probability for a couple of pre-tokenized Chinese sentences."""
from pyfasttext import FastText

classifier = FastText('/opt/app/fasttext/geek-model-v1.bin')
texts = [
    '我 是 温涛 发 温总 , 最近 招聘 有 什么 新 的 进展 吗 ?',
    '我 新建 个 QQ 工作 群 , 以后 工作 的 事情 可以 在 群里 讨论 , 你加 一下'
]
# Bug fix: use print() calls (the Python-2-only print statements were a
# SyntaxError on Python 3) and drop the stray non-code text that followed
# the last statement, which made the whole file unparsable.
print('load model success.')
# Top-1 (label, probability) per input text.
labels = classifier.predict_proba(texts, 1)
print(labels)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pyfasttext works around the fasttext "C++ extension could not
allocate enough memory" problem. Summary: 1. train with the original fasttext
library; 2. predict with pyfasttext.
"""
from pyfasttext import FastText

model = FastText('classify_model.bin')

# Show what the loaded model knows: its label set and how many labels it has.
for info in (model.labels, model.nlabels):
    print(info)

texts = [
    '吃 什么 止泻 快 _ 宝宝 拉肚子 _ 酸味 重 _ 专题 解答 ',
    '增高 _ 正确 长高 方法 _ 刺激 骨骼 二次 生长发育 增高 精准 找到 长高 办法 , 有助 孩子 长高 的 方法 ,'
]

# Or with the probability
labels = model.predict_proba(texts, k=2)
print(labels)
print(model.predict(texts, k=1))
class FasttextClassifier(TextClassificationModel):
    """Text-classification model backed by a pretrained pyfasttext model file."""

    # Key under which the fastText model file is registered via add_file().
    FT_MODEL_KEY = "ft_classifier.model"
    # Extra pip requirements this model type declares.
    REQUIREMENTS = ["pyfasttext"]

    def __init__(self, ft_classifier_path, transform_func=None, init_func=None,
                 **kwargs):
        """
        :param ft_classifier_path: path to an existing fastText model file
        :param transform_func: optional function applied to input data before
            prediction (called with the item plus model=self and **kwargs)
        :param init_func: optional hook called with model=self after the
            fastText model is loaded
        :raises FileNotFoundError: when ft_classifier_path does not exist
        """
        super(FasttextClassifier, self).__init__(**kwargs)
        if not os.path.isfile(ft_classifier_path):
            raise FileNotFoundError('File does not exist: %s' % ft_classifier_path)
        if self.name is None:
            # use filename without extension as name
            self.name = simplify(get_file_name(ft_classifier_path))
        self.init_func = init_func
        self.transform_func = transform_func
        self.add_requirements(FasttextClassifier.REQUIREMENTS)
        self.add_file(FasttextClassifier.FT_MODEL_KEY, ft_classifier_path)
        self._init_model()

    @overrides
    def _init_model(self):
        # Load the registered model file into a pyfasttext FastText instance,
        # then run the optional user-provided initialization hook.
        self.model_instance = FastText(
            self.get_file(FasttextClassifier.FT_MODEL_KEY))
        if self.init_func:
            self.init_func(model=self)

    @overrides
    def _save_model(self, output_path):
        # Only drops the in-memory model before serialization; the model file
        # itself is already persisted via add_file() in __init__.
        # NOTE(review): nothing is written to output_path here — presumably the
        # base class handles persisting registered files; confirm.
        del self.model_instance

    @overrides
    def _predict(self, data, limit=None, **kwargs):
        """Predict labels for a single item.

        :param data: one input item (passed to predict_proba_single)
        :param limit: max number of labels to return; clamped to the model's
            label count, and defaulted to it when falsy
        :return: DataFrame with ITEM_COLUMN (labels) and SCORE_COLUMN (probas)
        """
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels
        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                data = self.transform_func(data, model=self, **kwargs)
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")
        prediction = self.model_instance.predict_proba_single(data, k=limit)
        if prediction == []:
            # add no predictions of failed to predict something.
            # TODO is this the best solution
            new_labels = ["NO_PREDICTION"] * limit
            new_probabilities = [0.0] * limit
            prediction = list(zip(new_labels, new_probabilities))
        return pd.DataFrame(prediction, columns=[ITEM_COLUMN, SCORE_COLUMN])

    @overrides
    def predict_batch(self, data, limit=None, **kwargs):
        """Predict labels for a batch of items.

        :param data: iterable of input items (passed to predict_proba)
        :param limit: max number of labels per item; clamped/defaulted as in
            _predict
        :return: DataFrame with one row per item; ITEM_COLUMN holds the label
            list and SCORE_COLUMN the matching probability list
        """
        # Todo predict batch better function
        if not limit or limit > self.model_instance.nlabels:
            limit = self.model_instance.nlabels
        if self.transform_func:
            if isinstance(self.transform_func, types.FunctionType):
                # transform is only on a single item
                data = [
                    self.transform_func(item, model=self, **kwargs)
                    for item in data
                ]
            else:
                self._log.warning(
                    "Provided data transformer is not a function.")
        prediction = self.model_instance.predict_proba(data, k=limit)
        labels = []
        probabilities = []
        for ind, entry in enumerate(prediction):
            if entry == []:
                # Same placeholder scheme as _predict for empty predictions.
                new_labels = ["NO_PREDICTION"] * limit
                new_probabilities = [0.0] * limit
            else:
                # entry is a list of (label, probability) pairs; split columns.
                new_labels, new_probabilities = list(zip(*entry))
            labels += [list(new_labels)]
            probabilities += [list(new_probabilities)]
        df = pd.DataFrame()
        df[ITEM_COLUMN] = labels
        df[SCORE_COLUMN] = probabilities
        return df