class Fasttext_clf(BaseEstimator, ClassifierMixin):
    """Scikit-learn compatible wrapper around a pretrained fastText classifier.

    Loads the compressed fastText model (``ft_ad.ftz``) bundled with the
    ``addr_detector.model`` package and exposes ``fit`` / ``predict`` /
    ``predict_proba`` in the usual estimator shape.
    """

    # Default on-disk location of the bundled, pre-trained model.
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        # pyfasttext model handle; `default` is the label returned when the
        # model yields no prediction for a single string.
        self.model = FastText(path)
        self.default = '0'

    def fit(self, X, y):
        """No-op: the underlying fastText model is already trained."""
        return self

    def predict(self, X):
        """Predict labels for a single string or a list of strings.

        Returns a list in both cases; unrecognized single strings fall back
        to ``self.default``.
        """
        results = []
        if isinstance(X, str):
            # Bug fix: this assignment was commented out, leaving `res`
            # undefined and raising NameError for string input.
            res = self.model.predict_single(X)
            results = results + [self.default if not res else res]
        elif isinstance(X, list):
            # Bug fix: the model was queried twice for the same input
            # (one result was discarded); predict once and use it.
            results = results + self.model.predict(X)
        return results

    def predict_proba(self, X):
        """Return (label, probability) predictions for a string or list of strings."""
        results = []
        if isinstance(X, str):
            # Bug fix: this branch body was commented out, so string input
            # silently produced an empty result.
            results = results + [self.model.predict_proba_single(X)]
        elif isinstance(X, list):
            results = results + self.model.predict_proba(X)
        return results
def collect_docs(p, lang_detection_model_name=None, lang='en'):
    """Extract ``(url, paragraphs)`` pairs from an iterable of text lines.

    A line matching the URL regex is treated as a document URL; the line
    immediately after it is scanned for ``<PAR>...</PAR>`` paragraphs.

    Parameters
    ----------
    p : iterable of str
        Lines of the input partition/file.
    lang_detection_model_name : str, optional
        Name of a fastText language-ID model distributed via SparkFiles.
        When given, paragraphs are filtered to the target language.
    lang : str
        Language code to keep when a detection model is used.

    Returns
    -------
    list of (str, list of str)
        ``(url, paragraphs)`` pairs with a non-empty paragraph list.
    """
    # Bug fix: `model` was only bound inside the branch below, so the
    # common path (no detection model) raised NameError at `if model:`.
    model = None
    if lang_detection_model_name is not None:
        from pyfasttext import FastText
        model_path = SparkFiles.get(lang_detection_model_name)
        model = FastText(model_path)

    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    result = []
    lines = list(p)
    indices = [i for i, line in enumerate(lines) if regex.search(line.strip())]
    for idx in indices:
        # Guard: a URL on the very last line has no content line after it.
        if idx + 1 >= len(lines):
            continue
        content = lines[idx + 1]
        paras = re.findall('<PAR>(.*?)</PAR>', content, re.DOTALL)
        if model is not None:
            # Keep only paragraphs whose detected language matches `lang`.
            langs = model.predict(paras)
            kept = [(para, detected) for para, detected in zip(paras, langs)
                    if lang in detected]
            paras = [para for para, _ in kept]
        if paras:
            url = lines[idx].strip()
            result.append((url, paras))
    return result
# Baseline: load a pretrained supervised fastText model, label every line of
# the test set, and dump the predictions to baseline.csv.
t0 = time.time()
t1 = time.time()

# The model was trained beforehand on the command line:
#   fasttext supervised -input /media/sf_NLP/input/train_set.txt \
#     -output /media/sf_NLP/input/fasttext.model -label __label__ \
#     -lr 1.0 -minCount 3 -wordNgrams 1,2 -dim 2000
# Load it with pyfasttext (avoids the fasttext C++ extension memory issue).
classifier = FastText('/media/sf_NLP/input/fasttext.model.bin')

# Fix: read the test set with a context manager so the handle is always closed.
with open("/media/sf_NLP/input/test_set.txt") as fr:
    lines = fr.readlines()
# predict() returns one label list per input line; keep the top label.
labels_predict = [e[0] for e in classifier.predict(lines)]
print('prediction done: ', time.time() - t1)

t1 = time.time()
# Fix: write the CSV inside a `with` block and let enumerate() number the
# rows instead of a hand-maintained counter.
with open('baseline.csv', 'w') as fid0:
    fid0.write('id,class' + '\n')
    for i, item in enumerate(labels_predict):
        fid0.write(str(i) + ',' + str(item) + '\n')
print('result generate done: ', time.time() - t1)
print('time use: ', time.time() - t0)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pyfasttext avoids the fasttext C++-extension "cannot allocate
enough memory" problem. Summary: 1. train with the original fasttext library;
2. predict with pyfasttext.
"""
from pyfasttext import FastText

# Load the trained classification model and show its label inventory.
model = FastText('classify_model.bin')
print(model.labels)
print(model.nlabels)

# Sample (pre-tokenized) Chinese texts to classify.
texts = [
    '吃 什么 止泻 快 _ 宝宝 拉肚子 _ 酸味 重 _ 专题 解答 ',
    '增高 _ 正确 长高 方法 _ 刺激 骨骼 二次 生长发育 增高 精准 找到 长高 办法 , 有助 孩子 长高 的 方法 ,'
]

# Top-2 labels together with their probabilities...
proba_top2 = model.predict_proba(texts, k=2)
print(proba_top2)
# ...and the single best label per text.
print(model.predict(texts, k=1))
"""
# NOTE(review): the bare triple-quote above appears to close a string/comment
# block opened earlier in the file (outside this view) — confirm before editing.
# # Option_test = np.resize(Option_test, (4, 2000, 13))
# # print('Option_test[0][0][0] = ', Option_test[0][0][0])
# # print('Option_test[0][2][0] = ', Option_test[0][2][0])
# Move the option axis to the front so each candidate option can be scored
# separately below. Presumably Option_test is (batch, n_options, time, feat)
# -> (n_options, batch, time, feat) — TODO confirm against the loader.
Option_test = np.transpose(Option_test, [1, 0, 2, 3])
# # print('Option_test[0][0][0] = ', Option_test[0][0][0])
# # print('Option_test[2][0][0] = ', Option_test[2][0][0])

# Score every option against the shared MFCC input; `model` and `MFCC_test`
# are defined elsewhere in this file.
record = []
for opt in Option_test:
    record.append(model.predict([MFCC_test, opt]))
record = np.array(record).T
# print('record[0] = ', record[0])

# For each sample, the answer is the index of the highest-scoring option.
ans = []
for buf in record:
    for sample in buf:
        # print(sample)
        # break
        sampleList = sample.tolist()
        ans.append(sampleList.index(max(sampleList)))
# print('ans len = ', len(ans))
###juice###