Example #1
0
    return documents, classes

def category_to_number(classes, category_type):
    """Translate category labels into their positions in *category_type*.

    Args:
        classes: Iterable of labels to translate.
        category_type: Sequence offering an ``index`` method (e.g. a list);
            the position of each label's first occurrence is used.

    Returns:
        A list holding, for every label in *classes* and in the same order,
        the value of ``category_type.index(label)``.

    Raises:
        Whatever ``category_type.index`` raises for an unknown label
        (``ValueError`` for built-in lists, tuples and strings).
    """
    return [category_type.index(label) for label in classes]

# Script flow: list the category names, load the labelled documents, hold out
# part of them for evaluation, train the classifier and print a report.
print('> Read files...')
# Every entry found in data_dir is treated as a category name.
categories = os.listdir(data_dir)
# NOTE(review): this message is printed before the documents are loaded;
# the actual split happens two statements further down.
print('> Split data to test and train')
documents, classes = get_texts(categories)
# train_size=0.7 asks for a 70% training share (assuming this is
# scikit-learn's train_test_split -- confirm against the imports).
train_docs, test_docs, train_classes, test_classes = train_test_split(
    documents, classes, train_size=0.7)

# The classifier is told about every category up front; text is lowercased
# and the English stop-word list is passed in as stop_words.
classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english')
)

print('> Train classifier')
classifier.train(train_docs, train_classes)

print('> Classify test data...')
# Predictions are made only for the held-out documents.
predicted_classes = classifier.classify(test_docs)

print('> Complete.')
# Compare the true labels of the held-out documents with the predictions.
print(classification_report(test_classes, predicted_classes))

# Summary section: a 42-character rule followed by aligned "label: value" rows
# (label left-aligned in 25 columns, value right-aligned).
print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
print("{:<25}: {:>6.2f} %".format(
Example #2
0
# Fail fast when either of the required TSV data files is missing.
important_files = (TEST_DATA_FILE, LABELED_TRAIN_DATA_FILE)
for tsv_file in important_files:
    if not os.path.exists(tsv_file):
        # The message shows only the file's base name plus DATADIR.
        raise EnvironmentError("File {} doesn't exist at {}.".format(
            ntpath.basename(tsv_file), DATADIR
        ))

print("> Read train data")
# The labelled training file is tab-separated, hence sep='\t'.
train_data = read_csv(LABELED_TRAIN_DATA_FILE, sep='\t')

print("> Init classifier")
# Timestamp taken just before the classifier is constructed.
# NOTE(review): presumably used later to report elapsed time -- the
# consumer is not visible in this part of the file.
start_time = time.time()
# Two categories, 0 and 1; in test mode the labels come from
# train_data.sentiment (see below).
classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english')
)

# Test mode: evaluate on a held-out part of the labelled data instead of
# training on all of it.
if options.test:
    print("> Split data to test and train")
    # train_size=0.7 asks for a 70% training share (assuming this is
    # scikit-learn's train_test_split -- confirm against the imports).
    train_docs, test_docs, train_classes, test_classes = train_test_split(
        train_data.review, train_data.sentiment, train_size=0.7
    )

    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    # Number of documents the classifier was actually trained on.
    total_docs = len(train_docs)

elif options.predict:
Example #3
0

def non_shuffling_train_test_split(X, y, test_size=0.2):
    """Split *X* and *y* into leading (train) and trailing (test) parts.

    The original order is kept: everything before the cut index goes to the
    training part and everything from the cut index on goes to the test part.
    The same cut index, derived from ``X.shape[0]``, is applied to both
    *X* and *y*.

    Args:
        X: Array-like with a ``shape`` attribute that ``np.split`` accepts.
        y: Array-like that ``np.split`` accepts; split at the same index.
        test_size: Fraction used to derive the cut index. Defaults to 0.2.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    NOTE(review): the cut index is ``int((1 - test_size) * n) + 1``, so the
    training part receives one element more than the truncated share (10 rows
    with ``test_size=0.2`` yield 9 train / 1 test) -- confirm this is intended.
    """
    row_count = X.shape[0]
    cut = 1 + int(row_count * (1 - test_size))
    # A single split index always produces exactly two pieces per input.
    return (*np.split(X, [cut]), *np.split(y, [cut]))


# Script flow: load train.csv, build the classifier, split the data in file
# order (no shuffling), fill missing values in the training part and train.
print("> Read train data")
train_data = read_csv('train.csv')
print("> Init classifier")
# Timestamp taken just before the classifier is constructed.
# NOTE(review): presumably used later to report elapsed time -- the
# consumer is not visible in this part of the file.
start_time = time.time()
# NOTE(review): categories are the strings '1', '0' and 't'; confirm these
# match the values that actually occur in train_data['label'].
classifier = NaiveBayesTextClassifier(
    categories=['1', '0', 't'],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))

print("> Split data to test and train")
# Texts come from the 'tweet' column and labels from the 'label' column;
# the default test_size of the helper (0.2) is used.
train_docs, test_docs, train_classes, test_classes = non_shuffling_train_test_split(
    train_data['tweet'], train_data['label'])
# Missing values in the training texts and labels are replaced by the
# string '1'; the test part is left as it is here.
train_docs = train_docs.fillna('1')
train_classes = train_classes.fillna('1')
# Diagnostic output: whether any missing values remain, and the container type.
print(train_docs.isnull().any())
print(train_classes.isnull().any())
print(type(train_docs))
print("> Train classifier")
classifier.train(train_docs, train_classes)
# Number of documents the classifier was actually trained on.
total_docs = len(train_docs)
print("-" * 42)
Example #4
0
        "and put it in {}.".format(DATADIR))

# Fail fast when either of the required TSV data files is missing.
important_files = (TEST_DATA_FILE, LABELED_TRAIN_DATA_FILE)
for tsv_file in important_files:
    if not os.path.exists(tsv_file):
        # The message shows only the file's base name plus DATADIR.
        raise EnvironmentError("File {} doesn't exist at {}.".format(
            ntpath.basename(tsv_file), DATADIR))

print("> Read train data")
# The labelled training file is tab-separated, hence sep='\t'.
train_data = read_csv(LABELED_TRAIN_DATA_FILE, sep='\t')

print("> Init classifier")
# Timestamp taken just before the classifier is constructed.
# NOTE(review): presumably used later to report elapsed time -- the
# consumer is not visible in this part of the file.
start_time = time.time()
# Two categories, 0 and 1; in test mode the labels come from
# train_data.sentiment (see below).
classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))

# Test mode: evaluate on a held-out part of the labelled data instead of
# training on all of it.
if options.test:
    print("> Split data to test and train")
    # train_size=0.7 asks for a 70% training share (assuming this is
    # scikit-learn's train_test_split -- confirm against the imports).
    train_docs, test_docs, train_classes, test_classes = train_test_split(
        train_data.review, train_data.sentiment, train_size=0.7)

    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    # Number of documents the classifier was actually trained on.
    total_docs = len(train_docs)

# Predict mode: starts by loading the tab-separated test file.
elif options.predict:
    print("> Read test data")
    test_data = read_csv(TEST_DATA_FILE, sep='\t')