def test_pickle_unfitted():
    """Check that an unfitted classifier survives a pickle round trip.

    The classifier is pickled before it is fitted. Both the original and
    the unpickled copy must raise ``NotFittedError`` on ``predict`` until
    they are fitted, and must give the expected labels afterwards.
    """
    import tempfile  # local import: the module-level imports are not visible here

    ftdf = pd.DataFrame(
        data=[['woof woof', 0], ['meow meow', 1]],
        columns=['txt', 'lbl'],
    )
    expected = [
        ('woof woof', 0),
        ('meow meow', 1),
        ('meow', 1),
        ('woof lol', 0),
        ('meow lolz', 1),
    ]
    ft_clf = FirstColFtClassifier()

    # Use a real temporary file instead of a hard-coded path under the
    # user's home directory: that directory may not exist, and the file
    # was never removed afterwards.
    fd, pic_fpath = tempfile.mkstemp()
    os.close(fd)  # only the path is needed; the file is reopened below
    try:
        with open(pic_fpath, 'wb+') as bfile:
            pickle.dump(ft_clf, bfile)
        with open(pic_fpath, 'rb') as bfile:
            ft_clf2 = pickle.load(bfile)
    finally:
        os.unlink(pic_fpath)

    # The original instance is still unfitted at this point.
    with pytest.raises(NotFittedError):
        assert ft_clf.predict([['woof woof']])[0] == 0
    ft_clf.fit(ftdf[['txt']], ftdf['lbl'])
    for text, label in expected:
        assert ft_clf.predict([[text]])[0] == label

    # The unpickled copy does not compare equal to the original and is
    # unfitted as well.
    assert ft_clf2 != ft_clf
    with pytest.raises(NotFittedError):
        assert ft_clf2.predict([['woof woof']])[0] == 0
    ft_clf2.fit(ftdf[['txt']], ftdf['lbl'])
    for text, label in expected:
        assert ft_clf2.predict([[text]])[0] == label
def test_pickle():
    """Check that a fitted classifier survives a pickle round trip.

    The classifier is fitted, dumped to a temporary file and loaded back.
    The loaded copy must give the same predictions as the original.
    """
    ftdf = pd.DataFrame(
        data=[['woof woof', 0], ['meow meow', 1]],
        columns=['txt', 'lbl'],
    )
    expected = [
        ('woof woof', 0),
        ('meow meow', 1),
        ('meow', 1),
        ('woof lol', 0),
        ('meow lolz', 1),
    ]
    ft_clf = FirstColFtClassifier()
    ft_clf.fit(ftdf[['txt']], ftdf['lbl'])
    for text, label in expected:
        assert ft_clf.predict([[text]])[0] == label

    fd, pic_fpath = tempfile.mkstemp()
    os.close(fd)  # prevent a file-handle leak; the file is reopened by path
    try:
        with open(pic_fpath, 'wb+') as bfile:
            pickle.dump(ft_clf, bfile)
        with open(pic_fpath, 'rb') as bfile:
            ft_clf2 = pickle.load(bfile)

        # The loaded copy does not compare equal to the original.
        assert ft_clf2 != ft_clf
        for text, label in expected:
            assert ft_clf2.predict([[text]])[0] == label
    finally:
        # Clean up even when an assertion above fails.
        os.unlink(pic_fpath)
def test_predict_proba():
    """Check the ordering of the probabilities returned by ``predict_proba``.

    After fitting on the ``_ftdf()`` fixture, the first entry must be the
    larger one for 'woof woof' and the second the larger one for
    'meow meow'.
    """
    frame = _ftdf()
    classifier = FirstColFtClassifier()
    classifier.fit(frame[['txt']], frame['lbl'])

    woof_probs = classifier.predict_proba([['woof woof']])[0]
    assert woof_probs[0] > woof_probs[1]

    meow_probs = classifier.predict_proba([['meow meow']])[0]
    assert meow_probs[1] > meow_probs[0]
def test_predict():
    """Check ``predict`` on a classifier fitted on the ``_ftdf()`` fixture."""
    frame = _ftdf()
    classifier = FirstColFtClassifier()
    classifier.fit(frame[['txt']], frame['lbl'])

    # (input text, expected label) pairs, checked in order.
    expected_labels = (
        ('woof woof', 0),
        ('meow meow', 1),
        ('meow', 1),
        ('woof lol', 0),
        ('meow lolz', 1),
    )
    for text, label in expected_labels:
        assert classifier.predict([[text]])[0] == label
def test_cross_val():
    """Run ``cross_val_score`` with each of three classifier variants.

    The test only checks that 2-fold cross-validation with accuracy
    scoring completes without raising; the scores are not inspected.
    """
    # Factories rather than instances, so each classifier is built just
    # before it is used, in the same order as before.
    classifier_factories = (
        lambda: ColLblBasedFtClassifier('txt', epoch=3),
        lambda: IdxBasedFtClassifier(0, epoch=3),
        lambda: FirstColFtClassifier(epoch=3),
    )
    for make_classifier in classifier_factories:
        classifier = make_classifier()
        frame = _big_ftdf()  # a fresh frame for every variant
        cross_val_score(
            classifier,
            X=frame[['txt']],
            y=frame['lbl'],
            cv=2,
            scoring='accuracy',
        )
def test_pickle(quantize):
    """Check pickling of a fitted classifier, with an optional quantize probe.

    When ``quantize`` is truthy, the test only checks that
    ``quantize(cutoff=1)`` raises ``ValueError`` and leaves the model
    unquantized, then returns. Otherwise the fitted classifier is dumped
    to a temporary file and loaded back, and the loaded copy must give
    the same predictions as the original.

    Args:
        quantize: Flag selecting the quantization probe instead of the
            pickle round trip.
    """
    ftdf = pd.DataFrame(
        data=[['woof woof', 0], ['meow meow', 1]],
        columns=['txt', 'lbl'],
    )
    expected = [
        ('woof woof', 0),
        ('meow meow', 1),
        ('meow', 1),
        ('woof lol', 0),
        ('meow lolz', 1),
    ]
    ft_clf = FirstColFtClassifier()
    ft_clf.fit(ftdf[['txt']], ftdf['lbl'])

    if quantize:
        with pytest.raises(ValueError):
            ft_clf.quantize(cutoff=1)
        assert not ft_clf.is_quantized()
        return

    # From here on ``quantize`` is falsy, so the former trailing
    # ``if quantize:`` check on the loaded model could never run and has
    # been dropped.
    for text, label in expected:
        assert ft_clf.predict([[text]])[0] == label

    fd, pic_fpath = tempfile.mkstemp()
    os.close(fd)  # prevent a file-handle leak; the file is reopened by path
    try:
        with open(pic_fpath, 'wb+') as bfile:
            pickle.dump(ft_clf, bfile)
        with open(pic_fpath, 'rb') as bfile:
            ft_clf2 = pickle.load(bfile)

        # The loaded copy does not compare equal to the original.
        assert ft_clf2 != ft_clf
        for text, label in expected:
            assert ft_clf2.predict([[text]])[0] == label
    finally:
        # Clean up even when an assertion above fails.
        os.unlink(pic_fpath)
def test_bad_shape():
    """Check that ``fit`` raises ``ValueError`` for each malformed input pair."""
    classifier = FirstColFtClassifier()
    # (X, y) pairs that fit() is expected to reject, tried in order.
    malformed_inputs = (
        ([7], [0]),
        ([[7]], [[0]]),
    )
    for bad_x, bad_y in malformed_inputs:
        with pytest.raises(ValueError):
            classifier.fit(bad_x, bad_y)
# Wrap the training texts in a list and transpose, so the classifier gets
# a 2-D array (one column, assuming content_train is a 1-D sequence —
# TODO confirm against the code that builds it).
train_data_format = np.asarray([content_train]).T
logger.info("complete formate train data")
columns = train_data_df.columns.values.tolist()

# model train
logger.info("start train model")
classifier_dict = dict()
# One classifier per column from the third column on; each such column
# is used as the training labels for its own model.
for column in columns[2:]:
    train_label = train_data_df[column]
    # Logger arguments are passed separately so formatting only happens
    # when the record is actually emitted.
    logger.info("start train %s model", column)
    sk_clf = FirstColFtClassifier(
        lr=learning_rate,
        epoch=epoch,
        wordNgrams=word_ngrams,
        minCount=min_count,
        verbose=2,
    )
    sk_clf.fit(train_data_format, train_label)
    logger.info("complete train %s model", column)
    classifier_dict[column] = sk_clf
logger.info("complete train model")

logger.info("start save model")
model_path = config.model_path
# exist_ok=True avoids the gap between checking for the directory and
# creating it.
os.makedirs(model_path, exist_ok=True)
# NOTE(review): plain concatenation only lands inside model_path when it
# ends with a path separator — confirm against config before changing it.
joblib.dump(classifier_dict, model_path + model_name)
def create_model(self):
    """Return a new ``FirstColFtClassifier`` built with fixed hyper-parameters."""
    hyper_params = {
        'lr': 1.0,
        'epoch': 10,
        'wordNgrams': 1,
        'minCount': 5,
        'verbose': 2,
    }
    return FirstColFtClassifier(**hyper_params)
# Segment the training texts, taken from the second column of train_df.
logger.info("start seg train data...")
content_train = data_util.seg_words(args, train_df.iloc[:, 1])

logger.info("prepare train format...")
# Wrap in a list and transpose, so each segmented text ends up in its own
# row (presumably one space-separated token string per row — TODO confirm
# against data_util.seg_words).
train_data_format = np.asarray([content_train]).T
columns = train_df.columns.values.tolist()

logger.info("start train model...")
classifier_dict = {}
# One classifier per label column, starting at the third column.
for column in columns[2:]:
    train_label = train_df[column]  # labels for this column's model
    logger.info("start train %s model" % column)
    sk_clf = FirstColFtClassifier(
        lr=args.learning_rate,
        epoch=args.epoch,
        wordNgrams=args.word_ngrams,
        minCount=args.min_count,
        verbose=2,
    )
    sk_clf.fit(train_data_format, train_label)
    logger.info("complete train %s model" % column)
    classifier_dict[column] = sk_clf

# Persist all trained classifiers together under the configured name.
logger.info("start save train model...")
model_name = args.model_name
joblib.dump(classifier_dict, model_name)

# Segment the validation texts the same way as the training texts.
logger.info("start seg valid data...")
content_valid = data_util.seg_words(args, valid_df.iloc[:, 1])

logger.info("prepare valid format")
valid_data_format = np.asarray([content_valid]).T