Example #1
0
# Read the corpus: each sub-directory of `data_dir` holds one category's files.
print('> Read files...')
categories = os.listdir(data_dir)
print('> Split data to test and train')
documents, classes = get_texts(categories)
# 70/30 train/test split — presumably sklearn's train_test_split; verify import.
train_docs, test_docs, train_classes, test_classes = train_test_split(
    documents, classes, train_size=0.7)

# Naive Bayes classifier over the discovered categories; lower-cases tokens
# and removes English stop words before counting.
classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english')
)

print('> Train classifier')
classifier.train(train_docs, train_classes)

print('> Classify test data...')
predicted_classes = classifier.classify(test_docs)

# Per-class precision/recall/F1 summary.
print('> Complete.')
print(classification_report(test_classes, predicted_classes))

# Aligned summary table of overall metrics.
print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
print("{:<25}: {:>6.2f} %".format(
    "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))
)
print("{:<25}: {:>6.2f} %".format(
    "Kappa statistics", 100 * kappa(
        category_to_number(test_classes, categories),
Example #2
0
    categories=['1', '0', 't'],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'))

# --- Train/test preparation (chronological split, no shuffling) ---
print("> Split data to test and train")
docs_train, docs_test, labels_train, labels_test = non_shuffling_train_test_split(
    train_data['tweet'], train_data['label'])

# Replace missing tweets/labels with the placeholder '1' before training.
docs_train = docs_train.fillna('1')
labels_train = labels_train.fillna('1')
# Sanity checks: confirm no NaNs remain and inspect the container type.
print(docs_train.isnull().any())
print(labels_train.isnull().any())
print(type(docs_train))

print("> Train classifier")
classifier.train(docs_train, labels_train)
n_train = len(docs_train)

# Training summary.
print("-" * 42)
print("Total", n_train, " tweets")
print("Number of words", classifier.bag.shape[1], " words")
print("Parse time", time.time() - start_time, "seconds")
print("-" * 42)

# -------------- Classify --------------- #

print("> Start classify data")
start_time = time.time()

# Fill missing test values the same way as the training data.
docs_test = docs_test.fillna('1')
labels_test = labels_test.fillna('1')
predictions = classifier.classify(docs_test)
print(predictions, labels_test)
Example #3
0
    # Walk each category directory and load its texts; `documents` and
    # `classes` accumulate in parallel (both presumably initialised before
    # this loop, outside the visible excerpt — confirm against full source).
    for category in categories:
        category_files_path = os.path.join(data_dir, category)
        text_ids = os.listdir(category_files_path)
        # Bind the directory once so each call only needs the file name.
        prepare_category_file = functools.partial(prepare_file, category_files_path)
        texts = [prepare_category_file(f) for f in text_ids]
        documents += texts
        # One label per loaded text, aligned by position.
        classes += [category] * len(texts)

    return documents, classes

# --- Corpus loading ---
# Each sub-directory of `data_dir` is one category; get_texts() returns the
# documents together with a parallel list of category labels.
# Fix: corrected typo in the user-facing message ('Gategories' -> 'Categories').
print('Get Categories...')
categories = os.listdir(data_dir)
print('Reading Data...')
documents, classes = get_texts(categories)

# Hold out 10% of the corpus for evaluation.
train_docs, test_docs, train_classes, test_classes = train_test_split(documents, classes, train_size=0.9)

# Naive Bayes text classifier: lower-cases tokens and removes the NLTK
# English stop-word list before counting term frequencies.
clf = NaiveBayesTextClassifier(categories=categories, min_df=1, lowercase=True, stop_words=stopwords.words('english'))

print('Training...')
clf.train(train_docs, train_classes)

print('Predicting...')
predicted_classes = clf.classify(test_docs)

# Per-class precision/recall/F1 report.
print('Result:')
print('-' * 72)
print(classification_report(test_classes, predicted_classes))
print('-' * 72)

Example #4
0
# Binary-sentiment Naive Bayes classifier over English text.
classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'),
)

if options.test:
    # Evaluation mode: hold out 30% of the labelled data.
    print("> Split data to test and train")
    (train_docs, test_docs,
     train_classes, test_classes) = train_test_split(
        train_data.review, train_data.sentiment, train_size=0.7,
    )

    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    total_docs = len(train_docs)

elif options.predict:
    # Prediction mode: train on all labelled data, load the unlabelled file.
    print("> Read test data")
    test_data = read_csv(TEST_DATA_FILE, sep='\t')

    print("> Train classifier")
    classifier.train(train_data.review, train_data.sentiment)
    total_docs = len(train_data)

# NOTE(review): total_docs is unbound if neither option flag is set — confirm
# the CLI guarantees one of them.
print("-" * 42)
print(f"{'Total':<25}: {total_docs:>6} articles")
print(f"{'Number of words':<25}: {classifier.bag.shape[1]:>6} words")
Example #5
0
# Timed set-up of the binary (0/1) sentiment classifier.
print("> Init classifier")
start_time = time.time()
classifier = NaiveBayesTextClassifier(
    categories=[0, 1],
    min_df=1,
    lowercase=True,
    # 127 English stop words
    stop_words=stopwords.words('english'),
)

if options.test:
    # Evaluation mode: carve a 70/30 train/test split from the labelled data.
    print("> Split data to test and train")
    train_docs, test_docs, train_classes, test_classes = train_test_split(
        train_data.review,
        train_data.sentiment,
        train_size=0.7,
    )

    print("> Train classifier")
    classifier.train(train_docs, train_classes)
    total_docs = len(train_docs)

elif options.predict:
    # Prediction mode: train on the full labelled set, load unlabelled data.
    print("> Read test data")
    test_data = read_csv(TEST_DATA_FILE, sep='\t')

    print("> Train classifier")
    classifier.train(train_data.review, train_data.sentiment)
    total_docs = len(train_data)

# NOTE(review): total_docs stays unbound when neither CLI flag is given —
# confirm the option parser enforces one of them.
print("-" * 42)
print(f"{'Total':<25}: {total_docs:>6} articles")
print(f"{'Number of words':<25}: {classifier.bag.shape[1]:>6} words")
print(f"{'Parse time':<25}: {time.time() - start_time:>6.2f} seconds")