Example #1
0
class Fasttext_clf(BaseEstimator, ClassifierMixin):
    """Scikit-learn-style wrapper around a pre-trained fastText classifier.

    The bundled compressed model ``ft_ad.ftz`` is loaded by default; ``fit``
    is a no-op because the underlying model is already trained.
    """

    # Path of the pre-trained fastText model shipped inside the package.
    data_path = pkg_resources.resource_filename('addr_detector.model', 'ft_ad.ftz')

    def __init__(self, path=data_path):
        # NOTE(review): loading in __init__ (rather than fit) keeps the
        # estimator usable without a training step.
        self.model = FastText(path)
        self.default = '0'  # fallback label when the model yields no result

    def fit(self, X, y):
        """No-op: the underlying fastText model is already trained."""
        return self

    def predict(self, X):
        """Predict labels for a single string or a list of strings.

        Returns a list of predictions; an empty prediction on a single
        string falls back to ``self.default``.  Inputs of any other type
        yield an empty list (mirrors the original behavior).
        """
        results = []
        if isinstance(X, str):
            res = self.model.predict_single(X)
            results.append(self.default if not res else res)
        elif isinstance(X, list):
            # Fix: the original called self.model.predict(X) twice and
            # discarded the first result — run batch prediction once.
            results.extend(self.model.predict(X))
        return results

    def predict_proba(self, X):
        """Return label/probability predictions for a string or list of strings."""
        results = []
        if isinstance(X, str):
            results.append(self.model.predict_proba_single(X))
        elif isinstance(X, list):
            results.extend(self.model.predict_proba(X))
        return results
Example #2
0
def collect_docs(p, lang_detection_model_name=None, lang='en'):
    """Extract (url, paragraphs) pairs from an iterable of text lines.

    Each line matching a URL pattern is treated as a document URL; the line
    immediately after it holds the document body, from which ``<PAR>...</PAR>``
    paragraphs are extracted.  When a fastText language-identification model
    name is given, paragraphs are filtered to those whose detected language
    matches *lang*.

    Args:
        p: iterable of text lines (e.g. a file handle or a Spark partition).
        lang_detection_model_name: optional fastText model filename resolvable
            via ``SparkFiles.get``; when ``None``, no language filtering is done.
        lang: language label to keep when filtering (default ``'en'``).

    Returns:
        List of ``(url, [paragraph, ...])`` tuples, URLs stripped of whitespace.
    """
    # Fix: `model` was previously only bound inside the branch below, so the
    # default call (no model name) crashed with NameError at `if model:`.
    model = None
    if lang_detection_model_name is not None:
        from pyfasttext import FastText
        model_path = SparkFiles.get(lang_detection_model_name)
        model = FastText(model_path)

    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)

    result = []
    lines = list(p)
    indices = [i for i, line in enumerate(lines) if regex.search(line.strip())]
    for idx in indices:
        if idx + 1 >= len(lines):
            # Fix: a URL on the very last line has no content line after it;
            # the original raised IndexError here.
            continue
        content = lines[idx + 1]
        paras = re.findall('<PAR>(.*?)</PAR>', content, re.DOTALL)

        if model is not None:
            # Keep only paragraphs whose predicted labels include `lang`.
            langs = model.predict(paras)
            paras = [para for para, labels in zip(paras, langs) if lang in labels]

        if paras:
            url = lines[idx].strip()
            result.append((url, paras))

    return result
Example #3
0
t0 = time.time()
t1 = time.time()

# The model was trained offline with the fastText CLI:
# fasttext/fastText-0.1.0/fasttext supervised -input /media/sf_NLP/input/train_set.txt -output /media/sf_NLP/input/fasttext.model -output /media/sf_NLP/input/fasttext.model -label __label__ -lr 1.0 -minCount 3 -wordNgrams 1,2 -dim 2000
# (python equivalent:)
# classifier = fasttext.supervised("/media/sf_NLP/input/train_set.txt","/media/sf_NLP/input/#fasttext.model",label_prefix="__label__")

# Load the pre-trained model with pyfasttext.
classifier = FastText('/media/sf_NLP/input/fasttext.model.bin')

# Predict a label for every line of the test set.
with open("/media/sf_NLP/input/test_set.txt") as fr:
    lines = fr.readlines()
# predict() returns one list of labels per input line; keep the top label.
labels_predict = [e[0] for e in classifier.predict(lines)]
# labels_predict = [e[0][0] for e in classifier.predict_proba(lines)]
print('prediction done: ', time.time()-t1)
t1 = time.time()

# Write predictions as an id,class CSV.
# Fix: use a context manager so the file is closed even on error, and
# enumerate() instead of a manually incremented counter.
with open('baseline.csv', 'w') as fid0:
    fid0.write('id,class'+'\n')
    for i, item in enumerate(labels_predict):
        fid0.write(str(i) + ',' + str(item)+'\n')
print('result generate done: ', time.time()-t1)
print('time use: ', time.time()-t0)
Example #4
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pyfasttext avoids the fasttext C++ extension's
"cannot allocate enough memory" problem. Summary: 1. train with the original
fasttext library; 2. predict with pyfasttext.
"""

from pyfasttext import FastText

# Load the pre-trained classification model and inspect its label inventory.
model = FastText('classify_model.bin')
print(model.labels)
print(model.nlabels)

# Pre-segmented (space-tokenized) Chinese sample texts.
samples = [
    '吃 什么 止泻 快 _ 宝宝 拉肚子 _ 酸味 重 _ 专题 解答 ',
    '增高 _ 正确 长高 方法 _ 刺激 骨骼 二次 生长发育   增高 精准 找到 长高 办法   ,   有助 孩子 长高 的 方法   ,'
]
texts = samples

# Top-2 labels together with their probabilities.
labels = model.predict_proba(texts, k=2)
print(labels)

# Top-1 label only.
print(model.predict(texts, k=1))
Example #5
0
"""

# # Option_test = np.resize(Option_test, (4, 2000, 13))

# # print('Option_test[0][0][0] = ', Option_test[0][0][0])
# # print('Option_test[0][2][0] = ', Option_test[0][2][0])

Option_test = np.transpose(Option_test, [1, 0, 2, 3])

# # print('Option_test[0][0][0] = ', Option_test[0][0][0])
# # print('Option_test[2][0][0] = ', Option_test[2][0][0])

record = []

for opt in Option_test:
    record.append(model.predict([MFCC_test, opt]))

record = np.array(record).T
# print('record[0] = ', record[0])

ans = []
for buf in record:
    for sample in buf:
        # print(sample)
        # break
        sampleList = sample.tolist()
        ans.append(sampleList.index(max(sampleList)))

# print('ans len = ', len(ans))

###juice###