Example #1
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

def Xgboost():
    # genKeyWords is a project-specific helper returning (texts, labels).
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    vectors = CountVectorizer(max_df=0.2)  # drop terms in >20% of documents
    tfidf = TfidfTransformer(use_idf=True)
    bst = XGBClassifier(n_jobs=10, max_depth=55, objective='multi:softmax',
                        num_class=10, subsample=0.4, reg_lambda=0.8)
    pipeline = Pipeline([("vectors", vectors), ("tfidf", tfidf), ("bst", bst)])
    pipeline.fit(train_data, train_label)
    joblib.dump(pipeline, "./model/XGB.m")
    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    predicted = pipeline.predict(test_data)
    print('Xgboost', np.mean(predicted == test_label))
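Once dumped, the fitted pipeline can be reloaded for inference without retraining. A minimal sketch, assuming the save path used above and inputs in the same format genKeyWords produces:

import joblib

# Reload the fitted pipeline; it bundles the vectorizer, tf-idf, and classifier.
model = joblib.load("./model/XGB.m")
new_docs, _ = genKeyWords("../data/cnews.test.txt")
print(model.predict(new_docs[:5]))  # predicted categories for five documents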
Example #2
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

def svc():
    count = CountVectorizer(max_df=0.2, max_features=None)
    tfidf = TfidfTransformer(use_idf=False)  # plain term frequencies, no idf weighting
    _svc = SVC(C=0.99, kernel='linear')
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    pipeline = Pipeline([("count", count), ("tfidf", tfidf), ("svc", _svc)])

    pipeline.fit(train_data, train_label)  # fit() returns self; no reassignment needed
    joblib.dump(pipeline, "./SVM.m")
    predicted = pipeline.predict(test_data)
    print('SVC', np.mean(predicted == test_label))
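For a linear kernel on high-dimensional sparse text features, sklearn's LinearSVC (liblinear-based) usually trains far faster than SVC(kernel='linear') (libsvm-based). A drop-in sketch with the same pipeline shape; the hyperparameters simply mirror the ones above:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# liblinear scales much better with sample count than libsvm does.
fast_pipeline = Pipeline([
    ("count", CountVectorizer(max_df=0.2)),
    ("tfidf", TfidfTransformer(use_idf=False)),
    ("svc", LinearSVC(C=0.99)),
])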
Example #3
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.pipeline import Pipeline

def Bayes(mode='mul'):
    if mode == 'mul':
        model = MultinomialNB()
    elif mode == 'gau':
        # Note: 'gau' fails as written, because GaussianNB cannot consume the
        # sparse matrix TfidfTransformer emits (see the sketch after this example).
        model = GaussianNB()
    elif mode == 'bern':
        model = BernoulliNB()
    else:
        raise ValueError("Unknown mode; valid options are:\n"
                         " mul  ==> MultinomialNB\n"
                         " gau  ==> GaussianNB\n"
                         " bern ==> BernoulliNB")
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    vectors = CountVectorizer()
    tfidf = TfidfTransformer()
    pipeline = Pipeline([("vectors", vectors), ("tfidf", tfidf), ("bayes", model)])
    pipeline.fit(train_data, train_label)
    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    predicted = pipeline.predict(test_data)
    joblib.dump(pipeline, "./%s_bayes.m" % mode)
    print('naive_bayes', np.mean(predicted == test_label))
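As flagged in the comment above, GaussianNB.fit rejects sparse input, so mode='gau' raises a TypeError with this pipeline. One fix is to densify between tfidf and the model; a minimal sketch using FunctionTransformer (the 'to_dense' step name is illustrative, and dense tf-idf matrices can be very memory-hungry for large vocabularies):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# accept_sparse=True lets the transformer receive the sparse tf-idf matrix.
to_dense = FunctionTransformer(lambda X: X.toarray(), accept_sparse=True)
gau_pipeline = Pipeline([("vectors", CountVectorizer()),
                         ("tfidf", TfidfTransformer()),
                         ("to_dense", to_dense),
                         ("bayes", GaussianNB())])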
Example #4
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

def Knn():
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    vectors = CountVectorizer()
    tfidf = TfidfTransformer()
    clf = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    pipeline = Pipeline([("vectors", vectors), ("tfidf", tfidf), ("clf", clf)])
    # Optional grid search over the neighbour count (needs GridSearchCV from
    # sklearn.model_selection, as in the DTrees example below):
    # parameters = {'clf__n_neighbors': list(range(5, 20, 2))}
    # grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    # grid_search.fit(train_data, train_label)
    # best_parameters = grid_search.best_estimator_.get_params()
    # for param_name in sorted(parameters.keys()):
    #     print("\t%s: %r" % (param_name, best_parameters[param_name]))

    pipeline.fit(train_data, train_label)
    test_data, test_label = genKeyWords("../data/cnews.test.txt")
    predicted = pipeline.predict(test_data)
    joblib.dump(pipeline, "./knn.m")
    print('KNeighborsClassifier', np.mean(predicted == test_label))
Example #5
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

def DTrees():
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    vectors = CountVectorizer(max_df=0.6)
    tfidf = TfidfTransformer(use_idf=False)
    tree = DecisionTreeClassifier(criterion="entropy", max_depth=20)
    pipeline = Pipeline([("vectors", vectors), ("tfidf", tfidf), ("tree", tree)])
    # The grid overrides the initial max_depth=20 during the search; this
    # function only prints the best depth (test-set scoring is sketched below).
    params = {"tree__max_depth": list(range(75, 105, 5))}
    accuracy = make_scorer(accuracy_score)
    gridsearch = GridSearchCV(pipeline, params, n_jobs=10, scoring=accuracy)
    gridsearch.fit(train_data, train_label)
    best_parameters = gridsearch.best_estimator_.get_params()
    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
Example #6
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

def K_Means(minibatch):
    # Labels are loaded but unused for fitting: clustering is unsupervised
    # (see the evaluation sketch after this example).
    train_data, train_label = genKeyWords("../data/cnews.train.txt")
    vectors = CountVectorizer()
    tfidf = TfidfTransformer()
    if minibatch:
        km = MiniBatchKMeans(n_clusters=10, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=10, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)

    pipeline = Pipeline([("vectors", vectors), ("tfidf", tfidf), ("kmeans", km)])
    pipeline.fit(train_data)
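Since true categories exist for this corpus, cluster quality could be checked against them. A minimal sketch, meant to run after pipeline.fit inside K_Means where train_label is in scope, using metrics from sklearn.metrics that are invariant to how cluster ids are numbered:

from sklearn import metrics

# Compare the fitted cluster assignments with the known labels.
assigned = pipeline.named_steps["kmeans"].labels_
print("V-measure:", metrics.v_measure_score(train_label, assigned))
print("Adjusted Rand index:", metrics.adjusted_rand_score(train_label, assigned))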