def main():
    """Split the labeled single-label CSV into train/test files and sanity-check them.

    Reads 'data/labeled-data-singlelabels.csv' through ``lib.read_csv``,
    partitions it with ``lib.split_data``, and writes each partition to its
    own CSV. The partitions are then loaded again via ``lib.read_data`` and
    their sizes are compared with what was written.

    Raises:
        AssertionError: if a reloaded partition has a different length than
            the partition produced by the split.
    """
    all_rows = lib.read_csv('data/labeled-data-singlelabels.csv')
    train_tweets, test_tweets = lib.split_data(all_rows)

    lib.write_csv(train_tweets, 'data/labeled-data-singlelabels-train.csv')
    lib.write_csv(test_tweets, 'data/labeled-data-singlelabels-test.csv')

    # presumably lib.read_data() loads the two files written above -- verify
    # against lib before relying on this as a round-trip check.
    reloaded_train, reloaded_test = lib.read_data()

    written_vs_reloaded = (
        (train_tweets, reloaded_train),
        (test_tweets, reloaded_test),
    )
    for written, reloaded in written_vs_reloaded:
        assert len(written) == len(reloaded)
# Train an MLPClassifier on the data matrix built from the tokenized tweets,
# write one predicted label per line to 'english.output.txt', and pass the
# gold labels and predictions to scorer_semeval18.main.
import pickle

import numpy as np
from sklearn.neural_network import MLPClassifier
from lib import construct_data_matrix, split_data
from lib.data_paths import *
from scorer_semeval18 import main

# NOTE: pickle can execute arbitrary code while loading; TOK_TWEETS_PATH must
# only ever point at a file produced by this project.
with open(TOK_TWEETS_PATH, 'rb') as tweets_file:
    tokenized_tweets = pickle.load(tweets_file)
print('loaded tweets')

data_matrix = construct_data_matrix(tokenized_tweets)
print('constructed data matrix')
print('Dim:', data_matrix.shape)
# Fraction of non-zero entries in the matrix.
print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix))

# One label per line of the labels file.
with open(CLEAN_LABELS_PATH) as labels_file:
    labels = np.asarray(labels_file.read().splitlines())

data_train, data_test, labels_train, labels_test = split_data(
    data_matrix, labels)
print('split data')

# NOTE(review): the classifier is fitted on, and predicts for, the FULL data
# matrix; the train/test split above is never used, so the reported score is a
# training-set score. Left unchanged in case it is deliberate -- confirm
# whether fit(data_train, labels_train) / predict(data_test) was intended.
clf = MLPClassifier(max_iter=200, verbose=True, alpha=0.001)
clf.fit(data_matrix, labels)
score = clf.predict(data_matrix)

# The context manager closes the output file even if a write fails.
with open('english.output.txt', 'w') as f:
    f.writelines(s + '\n' for s in score)

main(labels, score)
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from lib import split_data
from preprocess import train_X, train_y
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

# Obtain the validation set.
# The names train_X / train_y imported from preprocess are rebound here to the
# training part returned by split_data; validate_X / validate_y hold the rest.
train_X, validate_X, train_y, validate_y = split_data(train_X, train_y)

# Random forest search (disabled experiment).
# NOTE(review): the loop variable i is never passed to the classifier
# (n_estimators is fixed at 147), so every iteration evaluates the same
# configuration -- presumably n_estimators=i was intended; confirm before
# re-enabling.
# maxN = [0, 0]
# for i in np.arange(2, 150):
#     y = 0
#     for j in range(3):
#         RFC = RandomForestClassifier(n_estimators=147, min_samples_split=4, min_samples_leaf=6, warm_start=True,
#                                      bootstrap=False)
#         RFC.fit(train_X, train_y)
#         y += RFC.score(validate_X, validate_y)
#     x = y / 3
#     if x > maxN[1]:
#         maxN[1] = x
#         maxN[0] = i
# print(maxN)
# rnd_for_model = RandomForestClassifier()
# grid_param = {"n_estimators": [100, 200, 500],