def create_dataset_header():
    """
    Build the empty dataset header for the message classification problem.

    The header holds a string attribute "Message" followed by a nominal
    attribute "Class" with the labels "miss" and "hit". It is created with
    an initial capacity of 0 and no class index is set here.

    :return: the header
    :rtype: Instances
    """
    attributes = [
        Attribute.create_string("Message"),
        Attribute.create_nominal("Class", ["miss", "hit"]),
    ]
    return Instances.create_instances("MessageClassificationProblem", attributes, 0)
def create_dataset(tweets):
    """
    Wrap tweet texts in a dataset whose class values are all missing.

    Each tweet becomes one instance: the text is registered with the string
    attribute "TEXT" and the nominal attribute "CLASS" is left missing.

    :param tweets: the tweet texts, one instance is created per element
    :return: the dataset named "tweets", with the last attribute set as class
    :rtype: Instances
    """
    attributes = [
        Attribute.create_string('TEXT'),
        # class_values is not defined in this function; it is expected to be
        # available from the enclosing module.
        Attribute.create_nominal('CLASS', class_values),
    ]
    dataset = Instances.create_instances("tweets", attributes, len(tweets))

    for text in tweets:
        row = [
            # add_string_value returns the value stored in the instance for
            # the string attribute.
            dataset.attribute(0).add_string_value(text),
            Instance.missing_value(),
        ]
        dataset.add_instance(Instance.create_instance(row))

    dataset.class_is_last()
    return dataset
# Train a multinomial Naive Bayes classifier on the bundled training set.
# The path is resolved relative to this file and assembled with os.path.join
# rather than by concatenating a hard-coded "/"-separated string.
train_data = loader.load_file(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "datasets", "train.arff")
)
train_data.class_is_last()

# The FilteredClassifier applies the StringToWordVector filter before handing
# the data to the base classifier.
string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")
fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls
fc.build_classifier(train_data)

# Create test data
# NOTE(review): this header (string "title", nominal "class") presumably has
# to be compatible with the header of train.arff -- confirm against the file.
class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")
test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)

# Add a single row with both values missing, then fill in the title through
# the dataset-owned instance; the class value stays missing.
inst = Instance.create_instance([Instance.missing_value(), Instance.missing_value()])
test_dataset.add_instance(inst)
test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
test_dataset.class_is_last()

# Run classifier
article_instance = test_dataset.get_instance(0)