Beispiel #1
0
def train_fasttext_win(inputPath='news_fasttext/news_fasttext_train.txt',
                       savePath='model.m',
                       label='__label__'):
    """Return a supervised fastText classifier, training it only when needed.

    When nothing exists at ``savePath`` a model is trained on ``inputPath``
    and saved there; otherwise the model stored at ``savePath`` is loaded.

    Args:
        inputPath: training file passed to ``ff.train_supervised``.
        savePath: location of the cached model on disk.
        label: label prefix forwarded to ``ff.train_supervised``.

    Returns:
        The object returned by ``ff.train_supervised`` or ``ff.load_model``.
    """
    if not os.path.exists(savePath):
        print('train model...')
        classifier = ff.train_supervised(inputPath, label=label)
        classifier.save_model(savePath)  # persist the model
    else:
        # Load from savePath rather than a hard-coded file name, so the file
        # whose existence was just checked is the one that gets loaded.
        classifier = ff.load_model(savePath)
    print('loaded model...')
    return classifier
Beispiel #2
0
def train_fasttext(inputPath='train.txt',
                   savePath='./model.m',
                   label='__label__'):
    """Load the classifier cached at ``savePath``, training it first if absent.

    Args:
        inputPath: training file passed to ``ft.train_supervised``.
        savePath: file the model is saved to / loaded from.
        label: label prefix forwarded to ``ft.train_supervised``.

    Returns:
        The trained or loaded model object.
    """
    if os.path.exists(savePath):
        model = ft.load_model(savePath)
    else:
        print('train...')
        model = ft.train_supervised(inputPath, label=label)
        model.save_model(savePath)

    print('load model...')
    return model
Beispiel #3
0
def create_predict(HudongItem_csv):
    """Append one fastText vector line per distinct title to ``vector.txt``.

    Titles are read from the ``title`` column of ``HudongItem_csv``; each is
    looked up in Neo4j and wrapped in a ``HudongItem``. For every title not
    seen before, a line ``<title> <v1> <v2> ...`` is appended, where each
    vector component is the first seven characters of its ``str()`` form.
    Progress is printed after every written line.

    Args:
        HudongItem_csv: path of the CSV file handed to ``readCSVbyColumn``.
    """
    # Read the entries from Neo4j.
    db = Neo4j()
    db.connectDB()

    predict_List = readCSVbyColumn(HudongItem_csv, 'title')
    model = FastText('wiki.zh.bin')
    total = len(predict_List)
    vis = set()

    # The context manager closes the file even if a lookup raises mid-way.
    with open('vector.txt', 'a') as file_object:
        # count advances for every CSV row, including skipped duplicates.
        for count, name in enumerate(predict_List, start=1):
            cur = HudongItem(db.matchHudongItembyTitle(name))
            title = cur.title
            if title in vis:
                continue
            vis.add(title)

            components = [str(value)[:7] for value in model[title]]
            strr = ' '.join([str(title)] + components)

            print('===============')
            print(str(title))
            print('===============')
            print(strr)
            print('===============')
            file_object.write(strr + "\n")
            print(str(count) + ' / ' + str(total))
def test():
    """Evaluate the saved classifier on the test file and print the result.

    The second value returned by ``clf.test`` is printed under the label
    'accuracy score'.
    """
    model_file = os.path.join(const.MODELPATH, 'jieba_v3_ft_model_2.ft')
    clf = FT.load_model(model_file)
    eval_file = os.path.join(const.DATAPATH, 'jieba_test_fasttext_v3.txt')
    _size, precision, _recall = clf.test(eval_file)
    print('accuracy score : ', precision)
def get_fasttext_tweet_embedding(
        tweet_word_list: List[str],
        fasttext_model: fastText.FastText) -> np.ndarray:
    """Return the element-wise mean of the word vectors of a tweet.

    Args:
        tweet_word_list: the words of one tweet.
        fasttext_model: model whose ``get_word_vector`` is called per word.

    Returns:
        ``np.mean`` over axis 0 of the collected vectors.
        NOTE(review): an empty word list is passed straight to ``np.mean``;
        confirm callers never pass one.
    """
    word_vectors = [
        fasttext_model.get_word_vector(token) for token in tweet_word_list
    ]
    return np.mean(word_vectors, axis=0)
Beispiel #6
0
    def train_classifier(self):
        """Train, save and evaluate the classifier, then log the outcome.

        Trains on ``self.data_path``, saves the model under
        ``self.model_save_path``, tests it on ``test_big.txt`` under
        ``self.test_save_path``, prints the test result and elapsed time, and
        appends both to ``fasttext_result_big.txt`` via ``save_file``.
        """
        began = time.time()

        # One of the best parameter sets found by experiment
        # (original notes: epoch=20 -> 0.91, epoch=50 -> 0.93).
        classifier = ff.train_supervised(self.data_path,
                                         lr=0.1,
                                         loss='hs',
                                         wordNgrams=2,
                                         epoch=300)
        model_file = (self.model_save_path +
                      'level_2_fasttext_classifier_big_big.model')
        classifier.save_model(model_file)  # original notes: all 0.91, all_2 0.93
        classifier.get_labels()  # return value is not used

        # Evaluate the freshly trained model.
        test_result = classifier.test(self.test_save_path + 'test_big.txt')
        result_str = 'test precision:{}\n'.format(test_result)
        print(result_str)

        elapsed = round(time.time() - began, 3)
        train_time_str = 'train and test model time %fs' % elapsed
        print(train_time_str)

        report = result_str + train_time_str + '\n'
        save_file(self.result_save_path + 'fasttext_result_big.txt', report, 'a')
Beispiel #7
0
def mode_predict_fasttext(config, input_path):
    """Predict with a trained fastText model and write the results to a file.

    Reads ``data_predict.txt`` from ``input_path``, loads ``ftmodel.bin``
    from ``<config.log_root>/<config.model_name>/train`` and writes one
    ``label<TAB>probability`` line per input line to
    ``output/predict_result.txt``.

    Args:
        config: object providing ``log_root``, ``model_name`` and
            ``is_python_package``.
        input_path: directory containing ``data_predict.txt``.
    """
    input_path = os.path.join(input_path, 'data_predict.txt')
    ftmodel_path = os.path.join(config.log_root, config.model_name, "train", "ftmodel.bin")
    # Where the final predictions are stored.
    output_path = os.path.join("output", "predict_result.txt")

    # Load the trained model.
    ftmodel = fasttext.load_model(ftmodel_path)
    print('开始预测数据')
    with open(input_path, 'r', encoding='utf8') as fin:
        # Strip leading/trailing whitespace from every line.
        lines = [raw.strip() for raw in fin.readlines()]
        if config.is_python_package:
            # In the python package only predict_proba yields probabilities.
            predict_list = [pairs[0] for pairs in ftmodel.predict_proba(lines)]
        else:
            # Facebook's python binding returns two parallel lists.
            pred_list, pred_prob_list = ftmodel.predict(lines)
            predict_list = [
                (labels[0], probs[0])
                for labels, probs in zip(pred_list, pred_prob_list)
            ]

    # predict_list is [(label, probability), ...]; save it to the output folder.
    with open(output_path, "w", encoding="utf8") as fout:
        for label, prob in predict_list:
            # Keep only the part after the last "__" (drops __label__).
            bare_label = re.split("__", label)[-1]
            fout.write("%s\t%f\n" % (bare_label, prob))
    print('预测完成,已将预测值写入输出文件:', output_path)
Beispiel #8
0
 def load_fasttext(self, model_name):
     """Load ``model_name`` from ``self.model_save_path`` and print the load time.

     Returns:
         The object returned by ``ff.load_model``.
     """
     began = time.time()
     classifier = ff.load_model(self.model_save_path + model_name)
     elapsed = round(time.time() - began, 3)
     print('加载fasttext模型时间: %f' % elapsed)
     return classifier
 def pre_load_fastText_model(self):
     """Pre-load the fastText model into ``self.classifier``.

     Changes the process working directory to the directory of this source
     file first, so the relative model name resolves next to the module.

     :return: None
     """
     module_dir = os.path.dirname(os.path.abspath(__file__))
     os.chdir(module_dir)
     self.classifier = ff.load_model("model_w2_e24")
Beispiel #10
0
def train():
    """Train on ``train_path``, save to ``model_path`` and print test metrics.

    ``train_path``, ``model_path`` and ``test_path`` are read from the
    enclosing module scope.
    NOTE(review): the test result is accessed via ``.precision`` / ``.recall``
    attributes; confirm the installed fastText binding returns such an object.
    """
    classifier = ft.train_supervised(train_path)
    classifier.save_model(model_path)
    metrics = classifier.test(test_path)
    print("准确率:", metrics.precision)
    print("回归率:", metrics.recall)
    classifier.get_labels()  # return value is not used
def load_model(model_path):
    """Load and return the fastText model stored at ``model_path``."""
    # Imported inside the function, as in the original (commented there as
    # the Windows build of the module).
    import fastText.FastText as ff
    return ff.load_model(model_path)
Beispiel #12
0
    def load_words_model(self, model_filename):
        """Load ``<data_dir><model_filename>.bin`` and cache its metadata.

        Sets ``self.model``, ``self.word_dim`` (from ``get_dimension``) and
        ``self.words_list`` (from ``get_words``), then prints the
        dictionary size.
        """
        model_path = self.data_dir + model_filename + '.bin'
        self.model = fasttext.load_model(model_path)
        self.word_dim = self.model.get_dimension()
        self.words_list = list(self.model.get_words())

        vocab_size = len(self.model.get_words())
        print('Loaded. Dictionary size:%s' % '{:,}'.format(vocab_size))
def trainUnSpv(path: str, n=1, model='skipgram'):
    """Train unsupervised fastText vectors on ``path`` and return the model.

    Args:
        path: training corpus file.
        n: value passed as ``wordNgrams``.
        model: value passed as ``model``.

    Returns:
        The object returned by ``FT.train_unsupervised`` (epoch=100, dim=100).
    """
    options = dict(epoch=100, dim=100, wordNgrams=n, model=model)
    return FT.train_unsupervised(path, **options)
Beispiel #14
0
def fastText_classifier(train_data,model_save_path):
    """Train the level-1 book classifier, save it and print its test result.

    Args:
        train_data: path prefix prepended to ``book_sub_level_1_train.txt``.
        model_save_path: path prefix prepended to the saved model name.
    """
    train_file = train_data + 'book_sub_level_1_train.txt'
    classifier = ff.train_supervised(train_file,
                                     lr=0.1,
                                     loss='hs',
                                     wordNgrams=2,
                                     epoch=300)
    classifier.save_model(model_save_path + 'book_sub_classifier.model')  # save the model
    classifier.get_labels()  # return value is not used
    # Evaluated on the same file the model was trained on.
    print(classifier.test(train_file))
    
    '''
def trainFT(path: str, n=1):
    """Train a supervised fastText classifier on ``path`` and return it.

    Args:
        path: labelled training file.
        n: value passed as ``wordNgrams``.

    Returns:
        The object returned by ``FT.train_supervised`` (epoch=100, dim=100,
        label prefix ``__label__``, softmax loss).
    """
    options = dict(epoch=100,
                   dim=100,
                   wordNgrams=n,
                   label='__label__',
                   loss='softmax')
    return FT.train_supervised(path, **options)
    def fasttext_train(self):
        """Load the cached classifier (training it first if absent) and report test metrics.

        The model lives at ``<home_data>/fasttext.model``. If it is missing,
        a classifier is trained on ``fasttext_train.txt``, saved, and
        reloaded. The classifier is then tested on ``fasttext_test.txt`` and
        the three values of the test result are printed and logged.
        """
        # Import before branching: a function-level import makes ``ft`` a
        # local name, so importing only in the training branch left it
        # unbound (UnboundLocalError) whenever the cached model existed.
        import fastText.FastText as ft

        # One path for the existence check, save and load, so they cannot
        # disagree when home_data has no trailing separator.
        model_file = os.path.join(self.home_data, 'fasttext.model')
        if os.path.exists(model_file):
            print("Fasttext模型已经存在,直接载入")
            classifier = ft.load_model(model_file)
        else:
            print("Fasttext模型不存在,训练")
            classifier = ft.train_supervised(self.home_data + "fasttext_train.txt")  # train
            classifier.save_model(model_file)  # save
            classifier = ft.load_model(model_file)  # reload from disk

        result = classifier.test(self.home_data + "fasttext_test.txt")
        print("测试实例数", result[0])  # number of examples
        print("准确率", result[1])  # overall precision
        print("召回率", result[2])  # recall
        logging.info('测试实例数 %s' % str(result[0]))
        logging.info('准确率 %s' % str(result[1]))
        logging.info('召回率 %s' % str(result[2]))
Beispiel #17
0
def load_model(model_path, os_name):
    """Load a fastText model with the binding matching the platform.

    Args:
        model_path: path of the saved model.
        os_name: ``'nt'`` selects ``fastText.FastText``; any other value
            selects ``fasttext`` with ``label_prefix='__label__'``.

    Returns:
        The loaded classifier.
    """
    if os_name != 'nt':
        # Non-Windows (the original comment calls this the Linux model).
        import fasttext
        return fasttext.load_model(model_path, label_prefix='__label__')

    # Windows model.
    import fastText.FastText as ff
    return ff.load_model(model_path)
def fast_text_model(X_test):
    """Classify text with fastText.

    Trains on ``train.txt``, predicts ``X_test`` and evaluates on
    ``test.txt``.

    Returns:
        ``(label, prob, result)``: the two values returned by ``predict``
        and the value returned by ``test``.
    """
    # Train the classifier.
    classifier = ff.train_supervised('train.txt', label='__label__')
    # Predict: yields the predicted labels and their probabilities.
    prediction = classifier.predict(X_test)
    label, prob = prediction
    # Evaluate the model on the given data set.
    result = classifier.test('test.txt')
    return label, prob, result
Beispiel #19
0
def mode_train_fasttext(config, _input_path):
    """Train a fastText model and save it.

    Trains on ``data_train.txt`` from ``_input_path`` using the pretrained
    vectors ``word_vector.vec``; the model is written with the output prefix
    ``<config.log_root>/<config.model_name>/train/ftmodel``. Depending on
    ``config.is_python_package`` either the python package or the
    ``fasttext`` command-line binary under ``fasttext_path`` is used.
    """
    train_dir = os.path.join(config.log_root, config.model_name, "train")
    input_path = os.path.join(_input_path, 'data_train.txt')
    output_path = os.path.join(train_dir, "ftmodel")
    pretrained_path = os.path.join(train_dir, "word_vector.vec")

    # Run the training.
    print('开始训练fasttext模型')
    if config.is_python_package:
        # The python package needs an explicit bucket (unfixed bug); the saved
        # model grows roughly in proportion to the bucket size.
        fasttext.supervised(input_file=input_path,
                            output=output_path,
                            epoch=config.train_epoch,
                            lr=config.learning_rate,
                            dim=config.dim,
                            word_ngrams=config.wordNgrams,
                            loss=config.loss,
                            pretrained_vectors=pretrained_path,
                            bucket=200000)
    else:
        template = ('{}fasttext supervised '
                    '-input {} -output {} '
                    '-epoch {} -lr {} -dim {} -wordNgrams {} '
                    '-loss {} -verbose {} -pretrainedVectors {}')
        train_command = template.format(fasttext_path,
                                        input_path, output_path,
                                        config.train_epoch, config.learning_rate,
                                        config.dim, config.wordNgrams,
                                        config.loss, config.verbose,
                                        pretrained_path)
        os.system(train_command)
    print('训练fasttext模型完毕')
Beispiel #20
0
def test_fasttext(train_path, test_path, model_save_path):
    """Train on ``A_train.txt``, save the model and print the test result.

    Args:
        train_path: path prefix prepended to ``A_train.txt``.
        test_path: path prefix prepended to ``A_test.txt``.
        model_save_path: path prefix prepended to the saved model name.
    """
    hyper_params = dict(lr=0.1, loss='hs', wordNgrams=2, epoch=50)
    classifier = ff.train_supervised(train_path + 'A_train.txt', **hyper_params)
    classifier.save_model(model_save_path + 'A_train_classifier2.model')  # save the model
    classifier.get_labels()  # return value is not used
    print(classifier.test(test_path + 'A_test.txt'))
def make_model(data_path, data_file_name, save_path, save_file_name, **kwargs):
    """Train unsupervised fastText vectors and save them.

    Args:
        data_path: directory of the training file.
        data_file_name: name of the training file.
        save_path: directory for the output.
        save_file_name: output name passed to ``save_vectors``.
        **kwargs: forwarded unchanged to ``FastText.train_unsupervised``.

    Returns:
        ``<save_path>/<save_file_name>.vec``. The path is returned even when
        training or saving failed; failures are printed, not raised.

    Raises:
        AssertionError: if the training file does not exist.
    """
    total_data_path = os.path.join(data_path, data_file_name)
    total_save_path = os.path.join(save_path, save_file_name)

    assert os.path.isfile(total_data_path), \
        'training file not found: {}'.format(total_data_path)
    print('모델생산 시작시간 : ', strftime("%y%m%d-%H%M%S"))

    try:
        # Empty kwargs expand to no extra arguments, so a single call covers
        # both cases (the former `kwargs.keys() == None` test was never true).
        model = FastText.train_unsupervised(total_data_path, **kwargs)
        model.save_vectors(total_save_path)
    except Exception as e:
        # Best-effort by design: report the failure and still return the path.
        print(e)
    finally:
        print('모델생산 종료시간 : ', strftime("%y%m%d-%H%M%S"))
    return total_save_path + '.vec'
Beispiel #22
0
def fasttext_model_train():
    """
    Train a grid of fastText models.

    For every epoch count in 5..50 and every wordNgrams value in 1..2 a
    classifier is trained on ``fasttext.train``, its training time is
    printed and it is saved as ``Model/model_w<ngrams>_e<epochs>``.
    :return: None
    """
    for epochs in range(5, 51):
        for ngrams in (1, 2):
            began = time.time()
            classifier = ff.train_supervised("fasttext.train",
                                             epoch=epochs,
                                             lr=0.5,
                                             wordNgrams=ngrams)
            elapsed = time.time() - began
            print("ngram=%d,训练第%d轮,用时%s" % (ngrams, epochs, elapsed))
            classifier.save_model("Model/model_w{}_e{}".format(ngrams, epochs))
Beispiel #23
0
def mode_evaluate_fasttext(config, input_path):
    """Test the trained fastText model on ``data_test.txt`` and print the result.

    Uses the python package when ``config.is_python_package`` is true;
    otherwise runs the ``fasttext test`` command-line binary found under
    ``fasttext_path`` and prints its output.
    """
    input_path = os.path.join(input_path, 'data_test.txt')
    ftmodel_path = os.path.join(config.log_root, config.model_name, "train", "ftmodel.bin")

    if not config.is_python_package:
        # Run the test command.
        test_command = '{}fasttext test {} {}'.format(fasttext_path, ftmodel_path, input_path)
        test_log = os.popen(test_command)
        print('测试集的测试结果:\n{}\n'.format(test_log.read()))
        return

    ftmodel = fasttext.load_model(ftmodel_path)
    test_log = ftmodel.test(input_path)
    summary = '测试集的测试结果:\nN\t{}\nP@1\t{}\nR@1\t{}\n'.format(
        test_log.nexamples, test_log.precision, test_log.recall)
    print(summary)
def load_fastText_vectors_char(model, vocab):
    """Map every distinct entry of ``vocab`` to its fastText word vector.

    Args:
        model: path of the model handed to ``FastText.load_model``.
        vocab: iterable of (sub)words; repeated entries are looked up once.

    Returns:
        dict from subword to the value of ``get_word_vector`` for it.
    """
    m = FastText.load_model(model)

    # dict.fromkeys drops repeats while keeping first-seen order, so each
    # subword is queried exactly once.
    # (The original kept a commented-out alternative that used
    # get_subword_id / get_input_vector instead of get_word_vector.)
    return {
        subword: m.get_word_vector(subword)
        for subword in dict.fromkeys(vocab)
    }
def fastText_classifier(train_data, model_save_path, result_save_path):
    """Train the level-3 classifier, save it, and print and store its test result.

    Args:
        train_data: path prefix prepended to ``level_3_train.txt``.
        model_save_path: path prefix prepended to the saved model name.
        result_save_path: path prefix prepended to ``train3_results.txt``.
    """
    train_file = train_data + 'level_3_train.txt'
    hyper_params = dict(lr=0.1, loss='hs', wordNgrams=2, epoch=150)

    classifier = ff.train_supervised(train_file, **hyper_params)
    classifier.save_model(model_save_path + 'level_3_classifier.model')  # save the model
    classifier.get_labels()  # return value is not used

    # Evaluated on the same file the model was trained on.
    result = classifier.test(train_file)
    print(result)
    with open(result_save_path + 'train3_results.txt', 'w') as fp:
        fp.write(str(result))
        fp.write('\n')
    '''
Beispiel #26
0
def pretrained_vectors(config, input_path):
    """Pre-train word vectors with fastText skipgram.

    Trains on ``data_train.txt`` from ``input_path`` and writes the output
    with prefix ``<config.log_root>/<config.model_name>/train/word_vector``,
    using the python package or the command-line binary depending on
    ``config.is_python_package``.
    """
    input_path = os.path.join(input_path, 'data_train.txt')
    output_path = os.path.join(config.log_root, config.model_name, "train", "word_vector")

    # Run the training.
    print('开始预训练词向量(仅需要训练一次。一般来说需要一些时间..) ')
    if config.is_python_package:
        fasttext.skipgram(input_file=input_path,
                          output=output_path,
                          epoch=config.train_epoch,
                          dim=config.dim)
    else:
        template = '{}fasttext skipgram -input {} -output {} -epoch {} -dim {} '
        train_command = template.format(fasttext_path, input_path, output_path,
                                        config.train_epoch, config.dim)
        os.system(train_command)
    print('预训练词向量完毕')
def fastText_classifier(train_data,test_data,model_save_path,result_save_path):
    """Train and test one classifier per entry of ``train_data``.

    Entries of ``train_data`` and ``test_data`` are paired in sorted name
    order. When both names contain ``.txt`` they are used directly as the
    train/test files (epoch=100); otherwise both are treated as directories
    and their first listed file is used (epoch=50). Each model is saved as
    ``<model_save_path><entry>_classifier.model`` and all results are
    written to ``<result_save_path>train_results.txt``.

    All path arguments are used as string prefixes, so directory arguments
    must end with a path separator.

    Args:
        train_data: directory prefix holding the training entries.
        test_data: directory prefix holding the test entries.
        model_save_path: directory prefix for saved models.
        result_save_path: directory prefix for the results file.
    """
    files = []
    results = []
    for directory in (train_data, model_save_path, result_save_path):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # os.listdir returns entries in arbitrary order; sort both listings so
    # the train/test pairing is deterministic.
    train_entries = sorted(os.listdir(train_data))
    test_entries = sorted(os.listdir(test_data))

    for level_one, test_l1 in zip(train_entries, test_entries):
        print(level_one+'-->'+test_l1)

        if '.txt' in level_one and '.txt' in test_l1:
            train_file = train_data+level_one
            test_file = test_data+test_l1
            reported_name = level_one
            epoch = 100
        else:
            data_list = os.listdir(train_data+level_one+'/')
            test_list = os.listdir(test_data+test_l1+'/')
            # Skip when either directory is empty (the original tested the
            # test_data path string instead of the test listing).
            if not data_list or not test_list:
                continue
            train_file = train_data+level_one+'/'+data_list[0]
            test_file = test_data+test_l1+'/'+test_list[0]
            reported_name = data_list[0]
            epoch = 50

        classifier = ff.train_supervised(train_file, lr=0.1, loss='hs',
                                         wordNgrams=2, epoch=epoch)
        classifier.save_model(model_save_path+level_one+'_classifier.model')  # save the model
        result = classifier.test(test_file)
        files.append(reported_name)
        results.append(result)
        print(result)

    print(files)
    print(results)

    with open(result_save_path+'train_results.txt','w') as fp:
        for name, result in zip(files, results):
            fp.write(str(name)+'-->'+str(result))
            fp.write('\n')
    '''
def sentence_input(sentence):
    """Preprocess a sentence and return the classifier's top-3 predictions.

    Steps: replace known emoticons with sentiment marker words, replace
    punctuation with spaces, remove the brand name, segment with jieba, drop
    stop words, and wrap sentiment words in ``# .. #`` (positive) or
    ``* .. *`` (negative) markers before predicting.

    Args:
        sentence: raw input text.

    Returns:
        List of ``(str(x), y)`` pairs zipped from the first two elements of
        the ``predict`` result; the list is also printed.
    """
    # NOTE: the model and all word lists are reloaded on every call.
    classifier = ff.load_model("fastText/model/train")
    positive_emoticons = load_positive_emoticons()
    negative_emoticons = load_negative_emoticons()
    punctuation_words = load_punctuation_words()
    stop_words = load_stop_words()
    positive_words = load_positive_words()
    negative_words = load_negative_words()

    # Replace emoticons with the matching sentiment word. str.replace is a
    # no-op when the emoticon is absent, so no membership test is needed.
    for emoticon in positive_emoticons:
        sentence = sentence.replace(emoticon, " 积极 ")
    for emoticon in negative_emoticons:
        sentence = sentence.replace(emoticon, " 消极 ")

    # Replace punctuation with spaces.
    for mark in punctuation_words:
        sentence = sentence.replace(mark, " ")

    # Segment, drop stop words, and tag sentiment words with special markers.
    line = sentence.replace("蒙牛", "")
    pieces = []
    for word in jieba.cut(line):
        if word in stop_words or word == "" or " " in word:
            continue
        if word in positive_words:
            pieces.append("# " + word + " #")
        elif word in negative_words:
            pieces.append("* " + word + " *")
        else:
            pieces.append(word)

    label = classifier.predict([" ".join(pieces).strip()], k=3)
    result = [(str(i), j) for i, j in zip(label[0], label[1])]
    print(result)
    return result
Beispiel #29
0
def get_predict(attrs, model_path, string):
    """Predict one label per attribute for a sentence.

    The sentence is segmented with jieba and joined with single spaces. For
    each attribute the model ``<model_path><attr>_model`` is loaded and the
    third ``__``-separated field of ``lab[0][0]`` is collected.

    Args:
        attrs: iterable of attribute names.
        model_path: path prefix of the per-attribute models.
        string: raw sentence to classify.

    Returns:
        List with one extracted label per attribute, in ``attrs`` order.
    """
    # e.g. ['我 要 一款 内存 大 运行 快 的 苹果 电脑']
    str_list = [' '.join(jieba.lcut(string)).strip()]
    print(str_list)

    attrs_predict = []
    for attr in attrs:
        print(attr, '预测:')
        classifier = ff.load_model(model_path + attr + "_model")
        lab = classifier.predict(str_list, k=1)
        # Example from the original: (['__label__2'], array([0.95485353]))
        top_label = lab[0][0]
        attrs_predict.append(top_label.split('__')[2])
    print(attrs_predict)
    return attrs_predict
Beispiel #30
0
    def train_words_model(self,
                          corpus_filename,
                          model_filename,
                          model='skipgram',
                          min_count=5):
        """Train unsupervised word vectors and save the model.

        Both file names are prefixed with ``self.data_dir``. The trained
        model is stored in ``self.model`` and its vocabulary in
        ``self.words_list``.

        Args:
            corpus_filename: training corpus, relative to ``self.data_dir``.
            model_filename: output model file, relative to ``self.data_dir``.
            model: fastText model name passed as ``model``. The default was
                previously misspelled ``'skpigram'``, which is not a model
                name fastText documents.
            min_count: value passed as ``minCount``.
        """
        corpus_filename = self.data_dir + corpus_filename
        model_filename = self.data_dir + model_filename
        print('Training for [%s] Model=%s Dim=%d MinCount=%d...' %
              (corpus_filename, model, self.word_dim, min_count))

        self.model = fasttext.train_unsupervised(input=corpus_filename,
                                                 model=model,
                                                 dim=self.word_dim,
                                                 minCount=min_count)
        self.model.save_model(model_filename)
        self.words_list = list(self.model.get_words())

        print('Finished. Dictionary size:%s' %
              '{:,}'.format(len(self.words_list)))