def create_dataset_header():
    """
    Builds an empty dataset named "MessageClassificationProblem".

    The dataset has a string attribute "Message" followed by a nominal
    attribute "Class" with the labels "miss" and "hit". It is created with
    capacity 0; no class index is set here.

    :return: the empty dataset header
    :rtype: Instances
    """
    attributes = [
        Attribute.create_string("Message"),
        Attribute.create_nominal("Class", ["miss", "hit"]),
    ]
    return Instances.create_instances("MessageClassificationProblem", attributes, 0)
# Example #2
# 0
def create_dataset(tweets):
    """
    Builds a dataset named "tweets" with one row per entry of ``tweets``.

    Each row stores the tweet in the string attribute 'TEXT' and leaves the
    nominal attribute 'CLASS' as a missing value. The labels of 'CLASS' come
    from ``class_values``, which is read from the enclosing scope. The last
    attribute is marked as the class attribute before returning.

    :param tweets: sized iterable of tweets (presumably text strings - confirm
        against the caller)
    :return: the populated dataset
    :rtype: Instances
    """
    attributes = [
        Attribute.create_string('TEXT'),
        Attribute.create_nominal('CLASS', class_values),
    ]
    dataset = Instances.create_instances("tweets", attributes, len(tweets))

    for tweet in tweets:
        # Register the text with the string attribute; the class stays missing.
        row = [
            dataset.attribute(0).add_string_value(tweet),
            Instance.missing_value(),
        ]
        dataset.add_instance(Instance.create_instance(row))

    dataset.class_is_last()
    return dataset
# Load the training set from the "datasets" directory next to this script.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded "/".
train_data = loader.load_file(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "datasets", "train.arff")
)
# Use the last attribute of the training data as the class attribute.
train_data.class_is_last()

string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")

# Combine the filter and the base classifier in one FilteredClassifier, which
# is then trained on the loaded data.
fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls

fc.build_classifier(train_data)

# Build a one-row test dataset: a string attribute "title" followed by the
# nominal class attribute.

str_att = Attribute.create_string("title")
class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])

test_dataset = Instances.create_instances("test_news_set", [str_att, class_att], 1)

# Add a row with both values missing, then set the title on the row as it is
# stored in the dataset.
inst = Instance.create_instance([Instance.missing_value()] * 2)
test_dataset.add_instance(inst)
test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
test_dataset.class_is_last()

# Fetch the row that will be passed to the classifier.

article_instance = test_dataset.get_instance(0)