Beispiel #1
0
def _top_k_hit_predictions(true_labels, probabilities, classes, k=2):
    """Return the true label when it is among the k most probable classes, else the top-1 class."""
    predictions = []
    for truth, prob in zip(true_labels, probabilities):
        # argsort() yields column indices of the probability row; they must
        # be mapped through `classes` to become comparable with the labels.
        candidates = [classes[idx] for idx in prob.argsort()[-k:][::-1]]
        predictions.append(
            next((cand for cand in candidates if cand == truth), candidates[0]))
    return predictions


def svm_train():
    """Tune a TF-IDF + linear SVM pipeline with hyperopt and log test metrics.

    Steps:
      1. Load the train / test / prediction splits via ``load_data()``.
      2. Run a ``HyperoptEstimator`` search (10 evaluations, TPE, 1200 s
         per-trial timeout) over ``liblinear_svc`` with one fixed TF-IDF
         preprocessing step.
      3. Take the best learner and its vectorizer, wrap the learner in
         ``CalibratedClassifierCV`` (its ``predict_proba`` is needed for the
         top-2 ranking), refit the vectorizer on all texts
         (train + test + pred) and fit the classifier on the training split.
      4. Log accuracy and a classification report for the test split, once
         for plain top-1 predictions and once for "top-2" predictions, where
         a sample counts as correct if its true label is among the two most
         probable classes.

    Returns:
        None. All results are written to the log.
    """
    # NOTE(review): the splits are concatenated with `+` below, so they are
    # presumably plain lists of documents - confirm against load_data().
    train_x, train_y, test_x, test_y, pred_x, _apps, _label_dic = load_data()

    logging.info('train {} test{}'.format(len(train_x), len(test_x)))
    logging.info("===" * 8)

    estim = HyperoptEstimator(
        classifier=liblinear_svc('clf'),
        max_evals=10,
        preprocessing=[
            tfidf('tfidf', ngram_range=(1, 4), min_df=10, max_df=0.9,
                  use_idf=1, smooth_idf=1, sublinear_tf=1),
        ],
        algo=tpe.suggest,
        trial_timeout=1200,
        refit=False,
    )
    logging.info(estim)
    estim.fit(train_x, train_y)
    best_model = estim.best_model()
    logging.info(best_model)

    vectorizer = best_model['preprocs'][0]
    lin_clf = CalibratedClassifierCV(best_model['learner'])

    # The vectorizer is fitted on every available text (including test and
    # prediction documents). Only the fitted state is needed here; the
    # transformed matrix was discarded, so fit() replaces fit_transform().
    vectorizer.fit(train_x + test_x + pred_x)
    lin_clf.fit(vectorizer.transform(train_x), train_y)

    test_term_doc = vectorizer.transform(test_x)
    test_preds_prob = lin_clf.predict_proba(test_term_doc)
    top1_preds = lin_clf.predict(test_term_doc)
    logging.info('accuracy_score {} top1 test\n {}'.format(
        accuracy_score(test_y, top1_preds),
        classification_report(test_y, top1_preds)))

    # Columns of predict_proba follow lin_clf.classes_, so the ranking is
    # translated to real labels before it is compared with test_y.
    top2_preds = _top_k_hit_predictions(
        test_y, test_preds_prob, lin_clf.classes_, k=2)
    logging.info('accuracy_score {} top2 test\n {}'.format(
        accuracy_score(test_y, top2_preds),
        classification_report(test_y, top2_preds)))
Beispiel #2
0
# Module-level setup: configure logging, define the data file locations and
# load all data splits once so that the functions below can share them.

# Log at INFO level; every record carries timestamp, source file, line number
# and level name in front of the message.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

# Root directory of the data files and the three input files inside it.
# NOTE(review): the doubled slash in project_path looks accidental - confirm.
# NOTE(review): the "jieba_json" suffix suggests jieba-tokenised JSON input -
# confirm against the loader.
project_path="/data/tanggp/xun_class//aichallenge/"
test_path=os.path.join(project_path,"apptype_train.test_jieba_json")
train_path=os.path.join(project_path,"apptype_train.train_jieba_json")
pred_path=os.path.join(project_path,"app_desc.jieba_json")

# Module-level state: a label counter, the start timestamp of the script and
# a fixed seed. None of them is read in the visible part of the file.
label_num = 0
t = time.time()
SEED = 314159265
import json

from sklearn.calibration import CalibratedClassifierCV

# Load every split up front. load_data() is defined elsewhere; whether it
# reads the *_path variables above cannot be seen from here.
train_x, train_y, test_x, test_y, pred_x, apps, label_dic = load_data()
logging.info('train {} test{}'.format(len(train_x), len(test_x)))
# All documents (train + test + prediction) concatenated; score() fits its
# TF-IDF vectorizer on this combined collection.
data_set = train_x + test_x + pred_x
def score(params):



    t1=time.time()
    logging.info(params)
    vec = TfidfVectorizer(ngram_range=(1,int(params["ngram_range"])), min_df=params["min_df"], max_df=params["max_df"], use_idf=1, smooth_idf=1, sublinear_tf=1)
    #vec=HashingVectorizer(ngram_range=(1, 3))

    vec.fit_transform(data_set)
    #
    # with open(project_path + 'tfidf.pkl', 'wb') as f:
    #     pickle.dump(vec, f)