def _best_of_top_k(y_true, probabilities, classes, k=2):
    """Pick, per sample, the true label if it is among the ``k`` most probable
    classes, otherwise the single most probable class.

    Args:
        y_true: iterable of ground-truth labels, one per sample.
        probabilities: iterable of per-sample probability rows (each row must
            support ``argsort``), with columns ordered like ``classes``.
        classes: sequence mapping a probability column index to its label.
        k: how many of the most probable classes count as a hit.

    Returns:
        A list with one chosen label per sample.
    """
    chosen_labels = []
    for truth, row in zip(y_true, probabilities):
        # argsort is ascending: take the k largest and put the best first.
        # Column positions are translated to real labels before comparing,
        # so this stays correct when labels are not exactly 0..n_classes-1.
        candidates = [classes[idx] for idx in row.argsort()[-k:][::-1]]
        chosen = candidates[0]
        for candidate in candidates:
            if truth == candidate:
                chosen = candidate
        chosen_labels.append(chosen)
    return chosen_labels


def svm_train():
    """Tune, calibrate and evaluate a linear SVM text classifier.

    Steps, in order:
      1. Load the train / test / prediction texts via ``load_data()``.
      2. Run a ``HyperoptEstimator`` search (``max_evals=10``, ``tpe.suggest``)
         over ``liblinear_svc`` with a ``tfidf`` preprocessing step.
      3. Wrap the best learner in ``CalibratedClassifierCV`` so that
         ``predict_proba`` is available.
      4. Fit the best preprocessing step on train + test + prediction texts,
         then fit the calibrated classifier on the training split.
      5. Log accuracy and a classification report for top-1 predictions and
         for "top-2" predictions (true label counted as correct when it is
         one of the two most probable classes).

    Returns:
        None. Results are only written to the log.
    """
    train_x, train_y, test_x, test_y, pred_x, apps, label_dic = load_data()
    logging.info('train {} test{}'.format(len(train_x), len(test_x)))
    logging.info("===" * 8)

    estim = HyperoptEstimator(
        classifier=liblinear_svc('clf'),
        max_evals=10,
        preprocessing=[
            tfidf('tfidf', ngram_range=(1, 4), min_df=10, max_df=0.9,
                  use_idf=1, smooth_idf=1, sublinear_tf=1)],
        algo=tpe.suggest,
        trial_timeout=1200,
        refit=False,
    )
    logging.info(estim)
    estim.fit(train_x, train_y)

    best_model = estim.best_model()
    logging.info(best_model)
    learner = best_model['learner']
    preprocs = best_model['preprocs'][0]

    # Calibration wrapper supplies predict_proba for the top-2 evaluation.
    lin_clf = CalibratedClassifierCV(learner)

    # The preprocessing step is fitted on every available text (train, test
    # and prediction set); the transformed result of this call is not used.
    data_set = train_x + test_x + pred_x
    preprocs.fit_transform(data_set)

    trn_term_doc = preprocs.transform(train_x)
    lin_clf.fit(trn_term_doc, train_y)

    test_term_doc = preprocs.transform(test_x)
    test_preds_prob = lin_clf.predict_proba(test_term_doc)
    top1_preds = lin_clf.predict(test_term_doc)
    logging.info('accuracy_score {} top1 test\n {}'.format(
        accuracy_score(test_y, top1_preds),
        classification_report(test_y, top1_preds)))

    # predict_proba columns follow lin_clf.classes_, so map through it
    # instead of treating the column index itself as the label.
    top2_preds = _best_of_top_k(test_y, test_preds_prob, lin_clf.classes_, k=2)
    logging.info('accuracy_score {} top2 test\n {}'.format(
        accuracy_score(test_y, top2_preds),
        classification_report(test_y, top2_preds)))
# ---- Module-level setup: everything below runs when the file is executed ----

# Log lines carry timestamp, source file name, line number, level and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

# Input file locations, all resolved under project_path.
# NOTE(review): the three *_path values are not referenced in the visible
# code (load_data() takes no arguments) — confirm they are still needed.
project_path="/data/tanggp/xun_class//aichallenge/"
test_path=os.path.join(project_path,"apptype_train.test_jieba_json")
train_path=os.path.join(project_path,"apptype_train.train_jieba_json")
pred_path=os.path.join(project_path,"app_desc.jieba_json")

# NOTE(review): label_num, t and SEED are not used in the visible part of the
# file; presumably consumed further down — verify.
label_num = 0
t = time.time()
SEED = 314159265
import json
from sklearn.calibration import CalibratedClassifierCV

# Load all splits once at module level; score() below reads data_set.
train_x, train_y, test_x, test_y, pred_x, apps, label_dic = load_data()
logging.info('train {} test{}'.format(len(train_x), len(test_x)))
# Concatenation of train, test and prediction texts, used to fit the vectorizer.
data_set = train_x + test_x + pred_x

def score(params):
    # Builds a TF-IDF vectorizer from the given hyper-parameters and fits it
    # on data_set. params is indexed with the keys "ngram_range", "min_df"
    # and "max_df".
    # NOTE(review): the remainder of this function lies outside the visible
    # excerpt; what it returns cannot be determined from here.
    t1=time.time()
    logging.info(params)
    # n-gram range always starts at 1; the upper bound is cast to int.
    vec = TfidfVectorizer(ngram_range=(1,int(params["ngram_range"])), min_df=params["min_df"], max_df=params["max_df"], use_idf=1, smooth_idf=1, sublinear_tf=1)
    #vec=HashingVectorizer(ngram_range=(1, 3))
    # Fit on all texts; the transformed matrix returned here is discarded.
    vec.fit_transform(data_set)
    #
    # with open(project_path + 'tfidf.pkl', 'wb') as f:
    #     pickle.dump(vec, f)