# Example #1
# 0

def non_shuffling_train_test_split(X, y, test_size=0.2):
    """Split ``X`` and ``y`` into a leading train part and a trailing test part.

    The original order is preserved (no shuffling): the first rows become the
    train set and the remaining rows become the test set. Both inputs are cut
    at the same position along their first axis.

    Args:
        X: Array-like exposing ``.shape`` (e.g. a NumPy array or pandas
            object); its first dimension is the number of samples.
        y: Array-like of targets, split at the same position as ``X``.
        test_size: Fraction of samples, between 0 and 1 inclusive, to place
            in the test part. The train part receives
            ``ceil((1 - test_size) * n_samples)`` rows.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is not within ``[0, 1]``.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(
            "test_size must be a fraction between 0 and 1, got %r" % (test_size,))

    n_samples = X.shape[0]
    # Round *up* to the next whole row instead of "truncate, then add one":
    # the latter hands the train part one row too many whenever the product
    # is already a whole number (10 rows at test_size=0.2 gave 9/1, not 8/2).
    # Rounding to 9 decimals first absorbs floating-point noise such as
    # 7.000000000000001, which would otherwise be pushed up to 8 by ceil.
    split_at = int(np.ceil(np.round((1.0 - test_size) * n_samples, 9)))

    X_train, X_test = np.split(X, [split_at])
    y_train, y_test = np.split(y, [split_at])
    return X_train, X_test, y_train, y_test


# --- Training script: load data, build the classifier, split, clean, train ---

# Load the training set from 'train.csv' in the current working directory.
print("> Read train data")
train_data = read_csv('train.csv')
print("> Init classifier")
# Wall-clock timestamp taken just before the classifier is constructed; it is
# not read within this part of the script (presumably used later to report
# elapsed time -- TODO confirm).
start_time = time.time()
# NOTE(review): the category list contains 't' alongside '1' and '0' -- it is
# unclear from this script what 't' stands for; confirm it is intentional.
classifier = NaiveBayesTextClassifier(
    categories=['1', '0', 't'],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))

# Order-preserving split with the helper's default test_size: the leading
# rows of the 'tweet' / 'label' columns become the train part, the trailing
# rows the test part.
print("> Split data to test and train")
train_docs, test_docs, train_classes, test_classes = non_shuffling_train_test_split(
    train_data['tweet'], train_data['label'])
# Replace missing values in the train part with the string '1', for both the
# documents and the labels. The test part is not filled here.
# NOTE(review): filling missing *tweets* with '1' looks like a placeholder,
# and the labels may have been parsed as numbers rather than strings --
# confirm that the string '1' is the intended fill value in both cases.
train_docs = train_docs.fillna('1')
train_classes = train_classes.fillna('1')
# Debug output: after the fills above, both checks are expected to print
# False; the third line shows the container type handed to the classifier.
print(train_docs.isnull().any())
print(train_classes.isnull().any())
print(type(train_docs))
print("> Train classifier")
classifier.train(train_docs, train_classes)
# Number of documents the classifier was trained on.
total_docs = len(train_docs)
# Visual separator (42 dashes) in the console output.
print("-" * 42)