Beispiel #1
0
def train():
    """Train a supervised fastText classifier, save it, and report metrics."""
    # Fit on the training file, then persist the resulting model.
    clf = ft.train_supervised(train_path)
    clf.save_model(model_path)
    # Evaluate on the held-out file and show precision/recall.
    evaluation = clf.test(test_path)
    print("准确率:", evaluation.precision)
    print("回归率:", evaluation.recall)
    clf.get_labels()
Beispiel #2
0
    def train_classifier(self):
        """Train a level-2 fastText classifier, evaluate it, and log timings."""
        # self.load_train_dataset()
        # One of the best parameter sets found by experiment.
        started = time.time()
        clf = ff.train_supervised(
            self.data_path, lr=0.1, loss='hs', wordNgrams=2,
            epoch=300)  # epoch=20 -> 0.91; epoch=50 -> 0.93
        # Persist the trained model.  all: 0.91; all_2: 0.93
        clf.save_model(
            self.model_save_path + 'level_2_fasttext_classifier_big_big.model')
        clf.get_labels()  # enumerate the label set
        # Evaluate the freshly trained model.
        # print('加载fasttext模型--{}'.format('level_1_fasttext_classifier_big_test.model'))
        # classifier = ff.load_model(self.model_save_path+'level_1_fasttext_classifier_big_test.model')
        outcome = clf.test(self.test_save_path + 'test_big.txt')
        summary = 'test precision:{}\n'.format(outcome)
        print(summary)

        elapsed = round(time.time() - started, 3)
        timing = 'train and test model time %fs' % elapsed
        print(timing)

        save_file(self.result_save_path + 'fasttext_result_big.txt',
                  summary + timing + '\n', 'a')
Beispiel #3
0
def fastText_classifier(train_data, model_save_path):
    """Train a book-subcategory classifier and print its self-test result."""
    clf = ff.train_supervised(train_data + 'book_sub_level_1_train.txt',
                              lr=0.1, loss='hs', wordNgrams=2, epoch=300)
    clf.save_model(model_save_path + 'book_sub_classifier.model')  # persist model
    clf.get_labels()  # enumerate labels
    # Resubstitution test on the training file itself.
    print(clf.test(train_data + 'book_sub_level_1_train.txt'))
    
    '''
def trainFT(path: str, n=1):
    """Train a supervised fastText model on *path*.

    *n* is the maximum word n-gram length; labels are expected to carry
    the default '__label__' prefix.  Returns the trained classifier.
    """
    settings = {
        'epoch': 100,
        'dim': 100,
        'wordNgrams': n,
        'label': '__label__',
        'loss': 'softmax',
    }
    return FT.train_supervised(path, **settings)
Beispiel #5
0
def train_fasttext_win(inputPath='news_fasttext/news_fasttext_train.txt',
                       savePath='model.m',
                       label='__label__'):
    """Train or reload a supervised fastText classifier.

    If *savePath* does not exist, a new model is trained on *inputPath*
    and saved there; otherwise the previously saved model is loaded.

    Returns the fastText classifier object.
    """
    if not os.path.exists(savePath):
        print('train model...')
        classifier = ff.train_supervised(inputPath, label=label)
        classifier.save_model(savePath)  # persist the model
    else:
        # BUG FIX: the original loaded the hard-coded 'model.m' regardless
        # of savePath; load the path the caller asked for instead (this
        # mirrors train_fasttext elsewhere in this file).
        classifier = ff.load_model(savePath)
    print('loaded model...')
    return classifier
def fast_text_model(X_test):
    """Text classification with fastText.

    Returns (label, prob, result): predicted labels and probabilities for
    *X_test*, plus the (n_examples, precision, recall) triple on 'test.txt'.
    """
    # Train a supervised classifier on the prepared file.
    model = ff.train_supervised('train.txt', label='__label__')
    # Predict labels and their probabilities for the given samples.
    labels, probabilities = model.predict(X_test)
    # Evaluate against the held-out file.
    evaluation = model.test('test.txt')
    return labels, probabilities, evaluation
Beispiel #7
0
def test_fasttext(train_path, test_path, model_save_path):
    """Train on A_train.txt, persist the model, and print test metrics."""
    settings = dict(lr=0.1, loss='hs', wordNgrams=2, epoch=50)
    clf = ff.train_supervised(train_path + 'A_train.txt', **settings)
    #classifier = load_model.load_model(model_save_path+'train_level_1_classifier.model','nt')
    clf.save_model(model_save_path + 'A_train_classifier2.model')  # persist model
    clf.get_labels()  # enumerate labels
    print(clf.test(test_path + 'A_test.txt'))
Beispiel #8
0
def train_fasttext(inputPath='train.txt',
                   savePath='./model.m',
                   label='__label__'):
    """Return a fastText classifier, training only when no saved model exists."""
    if os.path.exists(savePath):
        # Reuse the model cached on disk.
        model = ft.load_model(savePath)
    else:
        print('train...')
        model = ft.train_supervised(inputPath, label=label)
        model.save_model(savePath)

    print('load model...')

    return model
Beispiel #9
0
def fasttext_model_train():
    """Sweep fastText training over epochs 5..50 and word n-grams 1..2.

    Each trained model is saved under Model/ with the n-gram size and
    epoch count encoded in the file name.
    """
    for epochs in range(5, 51):
        for ngram in range(1, 3):
            t0 = time.time()
            clf = ff.train_supervised("fasttext.train",
                                      epoch=epochs,
                                      lr=0.5,
                                      wordNgrams=ngram)
            print("ngram=%d,训练第%d轮,用时%s" % (ngram, epochs, time.time() - t0))
            clf.save_model("Model/model_w" + str(ngram) + "_e" + str(epochs))
def fastText_classifier(train_data, model_save_path, result_save_path):
    """Train a level-3 classifier, print its self-test result, and log it."""
    clf = ff.train_supervised(train_data + 'level_3_train.txt',
                              lr=0.1, loss='hs', wordNgrams=2, epoch=150)
    clf.save_model(model_save_path + 'level_3_classifier.model')  # persist model
    clf.get_labels()  # enumerate labels
    # Resubstitution test on the training file.
    outcome = clf.test(train_data + 'level_3_train.txt')
    print(outcome)
    # Append-free single write of the result line.
    with open(result_save_path + 'train3_results.txt', 'w') as fp:
        fp.write(str(outcome))
        fp.write('\n')
    '''
def fastText_classifier(train_data, test_data, model_save_path, result_save_path):
    """Train one fastText classifier per train/test pair and record results.

    Walks the entries of *train_data* and *test_data* in parallel; plain
    .txt entries are trained/tested directly, directory entries use their
    first file.  Results are printed and written to train_results.txt.
    """
    files = []
    results = []
    if not os.path.exists(train_data):
        os.makedirs(train_data)
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    if not os.path.exists(result_save_path):
        os.makedirs(result_save_path)

    # NOTE(review): pairing by zip of two listdir() calls assumes both
    # directories list in the same order — verify against the data layout.
    for level_one, test_l1 in zip(os.listdir(train_data), os.listdir(test_data)):
        print(level_one + '-->' + test_l1)

        if '.txt' in level_one and '.txt' in test_l1:
            # BUG FIX: a stray full-width comma ('、') before lr made this
            # call a syntax error in the original.
            classifier = ff.train_supervised(train_data + level_one,
                                             lr=0.1, loss='hs',
                                             wordNgrams=2, epoch=100)
            #classifier = load_model.load_model(model_save_path+'train_level_1_classifier.model','nt')
            model = classifier.save_model(model_save_path + level_one + '_classifier.model')  # persist model
            classifier.get_labels()  # enumerate labels
            result = classifier.test(test_data + test_l1)
            files.append(level_one)
            results.append(result)
            print(result)
        else:
            data_list = os.listdir(train_data + level_one + '/')
            test_list = os.listdir(test_data + test_l1 + '/')
            # BUG FIX: the original tested len(test_data) — the path string,
            # always truthy — instead of the directory listing test_list.
            if not data_list or not test_list:
                continue
            classifier = ff.train_supervised(train_data + level_one + '/' + data_list[0],
                                             lr=0.1, loss='hs',
                                             wordNgrams=2, epoch=50)
            #classifier = load_model.load_model(model_save_path+'train_level_1_classifier.model','nt')
            model = classifier.save_model(model_save_path + level_one + '_classifier.model')  # persist model
            classifier.get_labels()  # enumerate labels
            result = classifier.test(test_data + test_l1 + '/' + test_list[0])
            files.append(data_list[0])
            results.append(result)
            print(result)
    print(files)
    print(results)

    with open(result_save_path + 'train_results.txt', 'w') as fp:
        for i, j in zip(files, results):
            fp.write(str(i) + '-->' + str(j))
            fp.write('\n')
    '''
    def fasttext_train(self):
        """Train or reload a fastText model under self.home_data and log metrics.

        Uses home_data/'fasttext.model' as the on-disk cache; when missing,
        trains on fasttext_train.txt, saves, and reloads the result.
        """
        # BUG FIX: the original imported ft only inside the else branch,
        # so the "model exists" path could raise NameError on ft.
        import fastText.FastText as ft
        model_file = os.path.join(self.home_data, 'fasttext.model')
        if os.path.exists(model_file):
            print("Fasttext模型已经存在,直接载入")
            # BUG FIX: load the exact path checked above — the original's
            # string concatenation can differ from os.path.join when
            # home_data has no trailing separator.
            classifier = ft.load_model(model_file)
        else:
            print("Fasttext模型不存在,训练")
            classifier = ft.train_supervised(self.home_data + "fasttext_train.txt")  # train model

            classifier.save_model(self.home_data + 'fasttext.model')  # persist model
            classifier = ft.load_model(self.home_data + 'fasttext.model')  # reload model
        result = classifier.test(self.home_data + "fasttext_test.txt")  # evaluation triple
        labels = classifier.get_labels()  # label set
        print("测试实例数", result[0])  # number of examples
        print("准确率", result[1])  # overall precision
        print("召回率", result[2])  # recall
        logging.info('测试实例数 %s' % str(result[0]))
        logging.info('准确率 %s' % str(result[1]))
        logging.info('召回率 %s' % str(result[2]))
Beispiel #13
0
 def train(self, vecSize, winSize, epochs, minCount, lossFunction,
           sampleThreshold, learnRate, ngrams, wordGrams, bucket):
     """Fit a supervised fastText model on self.trainingFile.

     lossFunction selects the loss: < 0 means negative sampling with
     -lossFunction negatives, 0 means softmax, > 0 hierarchical softmax.
     """
     # Resolve the loss name and negative-sample count up front instead
     # of inline conditional expressions.
     if lossFunction < 0:
         loss_name, negatives = "ns", -lossFunction
     elif lossFunction == 0:
         loss_name, negatives = "softmax", 0
     else:
         loss_name, negatives = "hs", 0
     self.model = ft.train_supervised(
         input=self.trainingFile,
         lr=learnRate,
         dim=vecSize,
         ws=winSize,
         epoch=epochs,
         minCount=minCount,
         loss=loss_name,
         neg=negatives,
         t=sampleThreshold,
         minn=ngrams // 2,
         maxn=ngrams,
         wordNgrams=wordGrams,
         bucket=bucket,
         verbose=0,
         thread=4,  # was tuned down from 8
     )
Beispiel #14
0
class TCFastText(object):


# Load the segmented corpus and give each label the fastText prefix.
texts, labels = read("./data/cut_data.txt")

labels = ["__label__" + str(lab) for lab in labels]

# One "text label" line per sample.
samples = [" ".join(pair) for pair in zip(texts, labels)]
data = np.array(samples)


# 8-fold split; only the first fold is actually used (see break below).
kf = KFold(n_splits=8)

for train_index, test_index in kf.split(data):

    print("Train:", train_index, "Test:", test_index)

    train = data[train_index]
    test = data[test_index]

    with open("data/ft_train", "w") as f:
        f.write("\n".join(train))

    with open("data/ft_test", "w") as f:
        f.write("\n".join(test))

    #  ft = FastText.train_supervised("data/ft_train", dim=128, epoch=60, minCount=4, wordNgrams=5, label="__label__")
    ft = FastText.train_supervised("data/ft_train", dim=128, epoch=60, minCount=5, wordNgrams=3, label="__label__")
    #  ft = FastText.train_supervised("data/ft_train", dim=80, epoch=60, minCount=5, wordNgrams=3, label="__label__")

    result = ft.test("data/ft_test")

    ft.save_model("fastText")

    print(result)

    break
# Ten-fold cross-validation of the final model's precision.
precision = []

for i in range(0, len(result)):
    # Split the corpus into per-fold train/test files (generated here,
    # removed again at the bottom of this script).
    with open(os.path.join(filename, "original/train.tsv"), 'w', encoding="utf-8") as f_train, \
         open(os.path.join(filename, "original/test.tsv"), 'w', encoding="utf-8") as f_test:
        a = result[i]
        list_train = random.sample(list(set(resultList) - set(a)), (len(resultList) - len(a)))
        for x in list_train:
            f_train.write(linecache.getline(train_new, x))
        for y in a:
            f_test.write(linecache.getline(train_new, y))
    start = time.time()
    # Training call.
    classifier = ff.train_supervised(os.path.join(filename, 'original/train.tsv'), dim=64, lr=0.7, wordNgrams=2,
                                     minCount=2, bucket=10000000, label='__label__', thread=20, epoch=7)
    classifier.save_model(os.path.join(filename, 'original/model/model' + str(i + 1) + '.model'))  # persist model
    test = classifier.test(os.path.join(filename, 'original/test.tsv'), k=1)  # evaluate this fold
    end = time.time()
    precision.append((test[1], end - start))
    print('模型预测准确率:', test[1])
    print("训练时间为:", end - start)

sum_precision = 0
sum_time = 0
for i, t in precision:
    sum_precision += i
    sum_time += t

# Remove the train/test files this script regenerates on every run.
# BUG FIX: the original comment promised both files would be deleted but
# only train.tsv was removed; delete test.tsv as well.
os.remove(os.path.join(filename, "original/train.tsv"))
os.remove(os.path.join(filename, "original/test.tsv"))
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the corpus, keeping only the segmented text and class columns.
data = pd.read_csv("D:\\new_data\\new_data\\train_set.csv")
data = data.loc[:, ['word_seg', 'class']]
# fastText expects the label column to carry a textual prefix.
data['label'] = data.apply(lambda x: 'label' + str(x[1]), axis=1)
x_train, x_test = train_test_split(data, test_size=0.3, random_state=42)

# Dump both splits as headerless, index-free tab-separated files.
for frame, target in ((x_train, "D:\\new_data\\new_data\\train_set1.txt"),
                      (x_test, "D:\\new_data\\new_data\\test_set1.txt")):
    frame.loc[:, ['word_seg', 'label']].to_csv(
        target,
        index=False,
        header=None,
        sep='\t')

import fastText.FastText as ff

# Train with the custom 'label' prefix, then evaluate on the held-out split.
classifier = ff.train_supervised('D:\\new_data\\new_data\\train_set1.txt',
                                 label="label")

result = classifier.test("D:\\new_data\\new_data\\test_set1.txt")
Beispiel #17
0
# _*_coding:utf-8 _*_
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
import fastText.FastText as fasttext
# Train the model.
classifier = fasttext.train_supervised("data/train/fastText_train.txt",
                                       label="__label__")

# Save the model.
classifier.save_model('models/fasttext_train.model.bin')

labels_right = []
texts = []
labels_predict = []
# BUG FIX: the file is opened in text mode, so each line is already str;
# calling .decode("utf-8") on it raises AttributeError under Python 3.
# Open with an explicit UTF-8 encoding and just strip the newline.
with open("data/train/fastText_train.txt", encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
        label_right = line.split("\t")[1]
        labels_right.append(label_right)
        text = line.split("\t")[0]
        texts.append(text)
        label_predict = classifier.predict(text)
        labels_predict.append(label_predict[0])
        print("文本")
        print(line)
        print("真实label")
        print(label_right)
        print("预测label")
        print(label_predict[0])
Beispiel #18
0
    maxn              # max length of char ngram [0]
    neg               # number of negatives sampled [5]
    wordNgrams        # max length of word ngram [1]
    loss              # 损失函数 {ns, hs, softmax, ova} [softmax]
    bucket            # number of buckets [2000000]
    thread            # 线程数 [number of cpus]
    lrUpdateRate      # 学习率更新速率 [100]
    t                 # sampling threshold [0.0001]
    label             # 标签前缀 ['__label__']
    verbose           # verbose [2]
    pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []

"""
# Train a supervised model with hierarchical-softmax loss and word
# 4-grams, then persist it to disk.
hyperparams = dict(lr=1, dim=300, epoch=5, wordNgrams=4, loss='hs')
model = ft.train_supervised("train.txt", **hyperparams)
model.save_model("model_file.bin")


def print_results(N, p, r):
    """Pretty-print a fastText test() triple: example count, P@1, R@1."""
    print(f"N\t{N}")  # number of test examples
    # P: precision, R: recall — both reported at k=1
    print(f"P@1\t{p:.3f}")
    print(f"R@1\t{r:.3f}")


print_results(*model.test('test.txt'))
Beispiel #19
0
import csv

#def transfercsv_to_fastText(csv_path,fastText_file):
# Convert the TSV test split into fastText's "<text>\t_label_<target>" lines.
path = r"data\Chinese\Chinese raw data\seg_test.tsv"
with open(r'data\Chinese\Chinese fasttext data\seg_test',
          'w',
          encoding='utf_8') as t:
    with open(path, 'r', encoding='utf_8') as f:
        for row in csv.reader(f, delimiter='\t'):
            # column 1 is the content, column 2 the target label
            t.write(row[1] + '\t' + '_label_' + row[2] + '\n')

# Train the model (note the custom single-underscore label prefix).
classifier = ff.train_supervised(
    r'data\Chinese\Chinese fasttext data\seg_train', label='_label_')
# Persist the model.
classifier.save_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')
# Reload it from disk.
classifier = ff.load_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')
# Evaluation: count the test lines first.
correct = 0
total_count = 0
with open(r'data\Chinese\Chinese fasttext data\seg_test',
          'r',
          encoding='utf_8') as t:
    lines = t.readlines()
    total_count = len(lines)
    print(total_count)
def fast_text_train(data_file, model_file, test_file):
    """Train a supervised model, save it, and return the k=1 test metrics.

    Returns whatever classifier.test() yields for *test_file* instead of
    discarding it as the original did.
    """
    classifier = ff.train_supervised(data_file)
    classifier.save_model(model_file)  # persist model
    # Dropped the dead debug locals (b = test.precision; a = 1) and
    # surfaced the evaluation result to the caller.
    return classifier.test(test_file, 1)
def train_model():
    """Sweep fastText over epochs 5..50 and plot macro/micro P/R/F1.

    For each epoch count a classifier is trained, per-label precision,
    recall and F1 are computed from confusion counts on the test file,
    then macro and micro averages are collected and finally plotted to
    ./ngram1.png.
    """
    start_time = time.time()
    all_marco_precision = []
    all_marco_recall = []
    all_marco_f1 = []
    all_micro_precision = []
    all_micro_recall = []
    all_micro_f1 = []
    for i in range(5, 51):
        classifier = ff.train_supervised("fastText/train_data",
                                         epoch=i,
                                         lr=0.5)
        classifier.save_model("fastText/model/train")
        print("模型构建时间:%s s" % str(time.time() - start_time))

        # fastText's built-in precision/recall targets multi-label data;
        # for single-label data the two coincide, so per-class counts are
        # computed manually below instead.
        # print("积极数据测试:")
        # test = classifier.test('fastText/test_data_positive')
        # print("测试数据数量:%d\t准确率:%f\t召回率:%f" % (test[0], test[1], test[2]))
        # print("中立数据测试:")
        # test = classifier.test('fastText/test_data_neutral')
        # print("测试数据数量:%d\t准确率:%f\t召回率:%f" % (test[0], test[1], test[2]))
        # print("消极数据测试:")
        # test = classifier.test('fastText/test_data_negative')
        # print("测试数据数量:%d\t准确率:%f\t召回率:%f" % (test[0], test[1], test[2]))

        correct_labels = [
            line.strip().split(" , ")[0] for line in open(
                'fastText/test_data', "r", encoding="utf-8").readlines()
        ]
        texts = [
            line.strip().split(" , ")[1] for line in open(
                'fastText/test_data', "r", encoding="utf-8").readlines()
        ]
        predict_labels = classifier.predict(texts)[0]
        evaluation_parameters = []
        labels = {"__label__-1": "消极", "__label__0": "中立", "__label__1": "积极"}
        for label, name in labels.items():
            # BUG FIX: the three counters were initialised once outside
            # this loop in the original, so every label's metrics
            # accumulated the previous labels' counts; reset them here.
            true_positive = 0
            false_positive = 0
            false_negative = 0
            evaluate_p = {}
            print("%s标签测试结果:" % name)
            evaluate_p["name"] = name
            evaluate_p["nexample"] = len(texts)
            # idx renamed from i so it no longer shadows the epoch counter.
            for idx in range(len(texts)):
                # predicted this class, actually this class
                if predict_labels[idx] == label and correct_labels[idx] == label:
                    true_positive += 1
                # predicted this class, actually another class
                elif predict_labels[idx] == label and correct_labels[idx] != label:
                    false_positive += 1
                # predicted another class, actually this class
                elif predict_labels[idx] != label and correct_labels[idx] == label:
                    false_negative += 1
            evaluate_p["true_positive"] = true_positive
            evaluate_p["false_positive"] = false_positive
            evaluate_p["false_negative"] = false_negative
            # Precision, recall and F1 for this label.
            # NOTE(review): divides by zero when a label is never
            # predicted / never occurs — confirm the test data covers all
            # three labels.
            precision = true_positive / (true_positive + false_positive)
            evaluate_p["precision"] = precision
            recall = true_positive / (true_positive + false_negative)
            evaluate_p["recall"] = recall
            f1 = 2 * precision * recall / (precision + recall)
            evaluate_p["f1"] = f1
            evaluation_parameters.append(evaluate_p)
            print("测试集大小:%d\t精确率:%f\t召回率:%f\tF_1:%f" %
                  (len(texts), precision, recall, f1))
        # Macro and micro averages over the three labels.
        sum_precision = 0
        sum_recall = 0
        sum_true_positive = 0
        sum_false_positive = 0
        sum_false_negative = 0
        for p in evaluation_parameters:
            sum_precision += p["precision"]
            sum_recall += p["recall"]
            sum_true_positive += p["true_positive"]
            sum_false_positive += p["false_positive"]
            sum_false_negative += p["false_negative"]
        n = len(evaluation_parameters)
        marco_precision = sum_precision / n
        all_marco_precision.append(marco_precision)
        marco_recall = sum_recall / n
        all_marco_recall.append(marco_recall)
        marco_f1 = 2 * marco_precision * marco_recall / (marco_precision +
                                                         marco_recall)
        all_marco_f1.append(marco_f1)
        print("宏平均----测试集大小:%d\t精确率:%f\t召回率:%f\tF_1:%f" %
              (len(texts), marco_precision, marco_recall, marco_f1))
        # The /n factors cancel in the micro ratios but are kept for
        # parity with the original output values.
        micro_true_positive = sum_true_positive / n
        micro_false_positive = sum_false_positive / n
        micro_false_negative = sum_false_negative / n
        micro_precision = micro_true_positive / (micro_true_positive +
                                                 micro_false_positive)
        all_micro_precision.append(micro_precision)
        micro_recall = micro_true_positive / (micro_true_positive +
                                              micro_false_negative)
        all_micro_recall.append(micro_recall)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision +
                                                         micro_recall)
        all_micro_f1.append(micro_f1)
        print("微平均----测试集大小:%d\t精确率:%f\t召回率:%f\tF_1:%f" %
              (len(texts), micro_precision, micro_recall, micro_f1))

    names = list(range(5, 51))
    ax1 = plt.subplot(311)
    plt.plot(names, all_marco_precision, label='marco-P')
    plt.plot(names, all_micro_precision, label='micro-P')
    plt.legend(loc='upper left')
    ax2 = plt.subplot(312, sharey=ax1)
    plt.plot(names, all_marco_recall, label='marco-P')
    plt.plot(names, all_micro_recall, label='micro-R')
    plt.legend(loc='upper left')
    plt.subplot(313, sharey=ax1)
    plt.plot(names, all_marco_f1, label='marco-F1')
    plt.plot(names, all_micro_f1, label='micro-F1')
    plt.legend(loc='upper left')
    plt.xlabel(u"训练轮数(ngram=1)")
    plt.savefig('./ngram1.png')
    plt.show()
            r = float(A[key]) / float(B[key])
            p = float(A[key]) / float(C[key])
            f = p * r * 2 / (p + r)
            logging.info("%s:\t p:%f\t r:%f\t f:%f" % (key, p, r, f))
        except:
            logging.error("error:", key, "right:", A.get(key, 0), "real:",
                          B.get(key, 0), "predict:", C.get(key, 0))


if __name__ == "__main__":
    base_dir = data_path.metaphor_data_base_dir
    filename_train = 'metaphor_recognition.fasttext.train'
    filename_validation = 'metaphor_recognition.fasttext.validation'

    # base_dir = r'/home/liyuncong/program/fasttext/data/'
    # filename_train = 'news_fasttext_train.txt'
    # filename_validation = 'news_fasttext_test.txt'

    filename_model = 'metaphor_recognition.fasttext'

    train_data = os.path.join(base_dir, filename_train)
    valid_data = os.path.join(base_dir, filename_validation)

    # train_supervised mirrors the fastText CLI's arguments and defaults.
    model = ff.train_supervised(train_data, epoch=25, lr=1.0, wordNgrams=2)

    # Report validation metrics, then run the per-example prediction dump.
    print_results(*model.test(valid_data))

    __predict(model, valid_data)
import fastText.FastText as ff
import jieba

# Supervised training on the prepared corpus, then persist the model.
classifier = ff.train_supervised("data/train.txt")
model = classifier.save_model('data/try.model')