Example #1
import lib


def main():
    # Load the labeled tweets and split them into train/test sets.
    data = lib.read_csv('data/labeled-data-singlelabels.csv')
    train_tweets, test_tweets = lib.split_data(data)

    # Persist both halves of the split.
    lib.write_csv(train_tweets, 'data/labeled-data-singlelabels-train.csv')
    lib.write_csv(test_tweets, 'data/labeled-data-singlelabels-test.csv')

    # Reload the files that were just written and confirm the sizes match.
    train_tweets2, test_tweets2 = lib.read_data()

    assert len(train_tweets) == len(train_tweets2)
    assert len(test_tweets) == len(test_tweets2)
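The lib helpers used in these examples are not shown here. As a rough sketch only, split_data presumably shuffles the rows and returns a train/test partition; an equivalent could be written around scikit-learn's train_test_split (the 0.8 ratio, random_state, and the optional labels argument are assumptions, not taken from the examples):

from sklearn.model_selection import train_test_split

def split_data(data, labels=None, train_fraction=0.8):
    """Hypothetical stand-in for lib.split_data: shuffle and split the rows.

    With only a data argument it returns (train, test); with labels it
    returns (X_train, X_test, y_train, y_test), matching how the examples
    below unpack the result.
    """
    if labels is None:
        return train_test_split(data, train_size=train_fraction, random_state=0)
    return train_test_split(data, labels, train_size=train_fraction, random_state=0)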
Example #2
import pickle

import numpy as np
from sklearn.neural_network import MLPClassifier

from lib import construct_data_matrix, split_data
from lib.data_paths import *
from scorer_semeval18 import main

# Load the pre-tokenized tweets and build the feature matrix.
tokenized_tweets = pickle.load(open(TOK_TWEETS_PATH, 'rb'))
print('loaded tweets')

data_matrix = construct_data_matrix(tokenized_tweets)
print('constructed data matrix')
print('Dim:', data_matrix.shape)
print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix))

# Split the matrix and labels into train and test portions.
labels = np.asarray(open(CLEAN_LABELS_PATH).read().splitlines())
data_train, data_test, labels_train, labels_test = split_data(
    data_matrix, labels)
print('split data')

# Train on the training split and predict on the held-out test split.
clf = MLPClassifier(max_iter=200, verbose=True, alpha=0.001)
clf.fit(data_train, labels_train)

predictions = clf.predict(data_test)

# Write one predicted label per line, then score against the gold labels.
with open('english.output.txt', 'w') as f:
    for p in predictions:
        f.write(p + '\n')

main(labels_test, predictions)
Example #3
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

from lib import split_data
from preprocess import train_X, train_y
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

# Split off a validation set
train_X, validate_X, train_y, validate_y = split_data(train_X, train_y)

# Random-forest hyperparameter search (commented out): try each value of
# n_estimators and keep the one with the best average validation score.
# maxN = [0, 0]
# for i in np.arange(2, 150):
#     y = 0
#     for j in range(3):
#         RFC = RandomForestClassifier(n_estimators=i, min_samples_split=4, min_samples_leaf=6, warm_start=True,
#                                      bootstrap=False)
#         RFC.fit(train_X, train_y)
#         y += RFC.score(validate_X, validate_y)
#     x = y / 3
#     if x > maxN[1]:
#         maxN[1] = x
#         maxN[0] = i
# print(maxN)

# rnd_for_model = RandomForestClassifier()
# grid_param = {"n_estimators": [100, 200, 500],