Example No. 1
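# Typical imports for this snippet (not shown in the original); print_f and
# b_i_impute_data are project-specific helpers.
import weka.core.jvm as jvm
import weka.core.converters as converters
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
import weka.plot.classifiers as plcls
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from imblearn.over_sampling import SMOTE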
def f_smote():
    jvm.start()

    train_data, test_data = b_i_impute_data()

    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    sm = SMOTE(sampling_strategy="minority")  # current imbalanced-learn API (formerly ratio=)
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)  # formerly fit_sample

    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)

    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
Example No. 2
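# Assumed imports for this snippet (not shown in the original):
import os
import time
import weka.core.converters as converters
from weka.classifiers import Classifier, Evaluation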
def ClassifyParam(mode, binWidths):
	if not os.path.exists("classificationResults"):
		os.makedirs("classificationResults")

	if("normal" in mode):
		file = open("classificationResults/AllVsAll.csv","w") 

		file.write("BinWidth, Accuracy\n")

		for binWidth in binWidths:

			train_set = "Data/arff/TrainSet_%s.arff"%(binWidth)
			test_set = "Data/arff/TestSet_%s.arff"%(binWidth)
			print "Loading Datasets..."

			train_data = converters.load_any_file(train_set)
			test_data = converters.load_any_file(test_set)
			#Set class attribute
			train_data.class_is_last()
			test_data.class_is_last()
			print "Dataset Loaded!"


			classifier_name = "weka.classifiers.meta.FilteredClassifier"

			classifier = Classifier(classname=classifier_name, options=[
				"-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
				"-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])


			start_train = time.time()
			classifier.build_classifier(train_data)
			end_train = time.time()
			print "Train\t%s\t%s"%(binWidth, end_train-start_train)

			# time a single prediction on the first test instance
			for index, inst in enumerate(test_data):
				if index == 0:
					start_sample = time.time()
					classifier.classify_instance(inst)
					end_sample = time.time()
					print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))

			print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
			evaluation = Evaluation(test_data)
			start_batch = time.time()
			evaluation.test_model(classifier, test_data)
			end_batch = time.time()
			print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)

			
			print(evaluation.summary())
			acc = evaluation.percent_correct/100.0
			print("Percent correct: " + str(acc))

			file.write("%s, %s\n"%(binWidth, acc))
		file.close()
Example No. 3
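# Assumed imports for this Flask view (not shown in the original):
from flask import request, render_template
import weka.core.jvm as jvm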
def command():
    jvm.start()

    import weka.core.converters as converters
    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']
    # print clusters
    # print a1
    # print a2
    if (a1 == 'B' and a2 == 'C'):
        data = converters.load_any_file("Data.csv")
    elif (a1 == 'B' and a2 == 'D'):
        data = converters.load_any_file("Data1.csv")
    elif (a1 == 'C' and a2 == 'D'):
        data = converters.load_any_file("Data2.csv")
    elif (a1 == 'C' and a2 == 'E'):
        data = converters.load_any_file("Data3.csv")
    elif (a1 == 'D' and a2 == 'E'):
        data = converters.load_any_file("Data4.csv")
    else:
        # guard: an unsupported combination would otherwise leave "data" undefined
        raise ValueError("Unsupported column combination: %s, %s" % (a1, a2))

    #data.class_is_last()

    print(data)

    # from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "2", "-E", "1"])
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    f = open("filename.txt", "w")
    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)

    print(clusterer)
    f.write(str(clusterer))
    # cluster the data
    for inst in data:
        cl = clusterer.cluster_instance(inst)  # 0-based cluster index
        dist = clusterer.distribution_for_instance(
            inst)  # cluster membership distribution
        print("cluster=" + str(cl) + ", distribution=" + str(dist))
        f.write("cluster=" + str(cl) + ", distribution=" + str(dist))

    f.close()
    return render_template("output.html")
Example No. 4
def train(objs, paras, outfiles):
  outfile = preprocess(outfiles)
  print('train', objs, paras, outfile)
  data = converters.load_any_file(outfile)
  preds = {}
  reals = {}
  for obj in objs:
    preds[obj] = []
    reals[obj] = []
  label = []
  testidxes = []
  for idx, ins in enumerate(data):
    label.append(random.randint(0, 9))
  for i in range(10):
    trainfile, testfile, testidx = split(data, istest_10fold, label, i)
    for obj in objs:
      traindata = cleanup(trainfile, paras, obj)
      testdata = cleanup(testfile, paras, obj)
      pred, real = eval_one_split(traindata, testdata, obj)
      preds[obj].extend(pred)
      reals[obj].extend(real)
    testidxes.extend(testidx)
    subprocess.call('rm %s %s' % (trainfile, testfile), shell=True)
  subprocess.call('rm %s' % outfile, shell=True)
  print('num ins', data.num_instances)
  for obj in objs:
    print(obj, metric(preds[obj], reals[obj]))
  return data, preds, reals, testidxes
Example No. 5
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    print("start weka")
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
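    # Capture each instance's prediction in CSV form while cross-validating.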
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)

    save = pout.buffer_content()

    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)
Example No. 6
def predict(exp, arff_path, dst_folder):
    """The function to generate a detailed prediction sequence of the experiment.

    Args:
        exp(obj): A util.runtime.Observation object.
        arff_path(str): The string that represents the path of the input arff
            file.
        dst_folder(str): The path of the folder to put the result.

    Returns:
        None
    """
    global __predictors
    import util.runtime as runtime
    import weka.core.converters as converters

    data = converters.load_any_file(arff_path)
    data.class_is_last()
    for cls_name, cls in __predictors.items():
        f_path = os.path.join(dst_folder, cls_name + '.txt')
        with open(f_path, 'w') as f:
            lines = []
            for index, inst in enumerate(data):
                prediction = cls.classify_instance(inst)
                print("Predictions file:", f_path, "Prediction:", prediction,
                      "[", int(prediction), "]",
                      runtime.all_classes[int(prediction)])
                # print("runtime.all_classes:", runtime.all_classes)
                lines.append(runtime.all_classes[int(prediction)])
            f.writelines('\n'.join(lines))
Example No. 7
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
Example No. 8
def OnlineClassification():
	#Classifies instances in an online way.
	#TODO: This is just a simple example of how online learning can be automated w/ WEKA
	# May be useful to later stages of the project.

	data_dir = "Testbed/"

	training = converters.load_any_file(data_dir + "training_dataset.csv")
	training.class_is_last()

	testing = converters.load_any_file(data_dir + "testing_dataset.csv")
	testing.class_is_last()

	with open(data_dir + "testing_dataset.csv", "r") as a:
		print(len(a.readlines()))


	cls_classes = ["weka.classifiers.trees.J48",
					"weka.classifiers.trees.RandomForest",
					"weka.classifiers.lazy.IBk"]

	classifiers = []
	for cls in cls_classes:
		classifiers.append(Classifier(classname=cls))


	#Set class attribute

	print(colored("======================================================", 'green'))
	print(colored("Experiment for dataset", 'green'))
	print(colored("======================================================", 'green'))


	for i, cls in enumerate(classifiers):
		cls.build_classifier(training)

		print("# - actual - predicted - right - distribution")
		for index, inst in enumerate(testing):
			pred = cls.classify_instance(inst)
			dist = cls.distribution_for_instance(inst)
			print(
				"%d - %s - %s - %s  - %s" %
				(index+1,
					inst.get_string_value(inst.class_index),
					inst.class_attribute.value(int(pred)),
					"yes" if pred == inst.get_value(inst.class_index) else "no",
					str(dist.tolist())))
Example No. 9
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv",index=False)

    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)

    print("3")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")

    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print(" Testclass details")
    print(evl.class_details())
    print("Testconfusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
Example No. 10
    def __init__(self):
        jvm.start()

        data_dir = "./DataSet/"
        self.data = converters.load_any_file(data_dir + "chatbot2.arff")
        self.data.class_is_last()

        self.cls = Classifier(classname="weka.classifiers.trees.J48")
        self.cls.build_classifier(self.data)

        self.intens = self.data.attribute_by_name("intent")
Example No. 11
def cleanup(f, attrs, obj):
  data = converters.load_any_file(f)
  n = data.num_attributes
  for idx in range(n):
    data.delete_with_missing(idx)  # drop instances with a missing value in this attribute
  for idx in reversed(range(n)):
    if data.attribute(idx).name not in attrs and data.attribute(idx).name != obj:
      data.delete_attribute(idx)  # keep only the requested attributes and the target
  for idx in range(data.num_attributes):
    if data.attribute(idx).name == obj:
      data.class_index = idx  # make the target attribute the class
  return data
Example No. 12
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)

    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)

    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)

        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")

    trn_data.class_is_last()
    tst_data.class_is_last()

    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))

    cls = Classifier(classname=classifiers[name.lower()], options=opt)

    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()

    return preds, distr
Example No. 13
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    file_list = os.listdir(path_files)

    # keep only CSV files; removing items while iterating over the same list skips entries
    file_list = [f for f in file_list if f.endswith(".csv")]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    print(file_list)

    for file in file_list:
        print(str(file))
        data = converters.load_any_file(path_files + "/" + file)

        data.class_is_last()

        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")

        evl = Evaluation(data)

        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        save = pout.buffer_content()

        with open(
                path_folder_save_results + '/' + 'prediction/' + str(name) +
                str(file)[:-4] + 'pred_data.csv', 'w') as f:
            f.write(save)

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
Example No. 14
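# folderPathOfArffFiles and kFold are module-level settings defined elsewhere in this project.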
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()

    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    # The ROC AUC is sliced out of the class-details string at a fixed offset,
    # which only works for this dataset's exact output layout.
    info = evaluation.class_details()
    roc_area = float(info[406:411])

    return roc_area
Example No. 15
def associateRule(request):

    jvm.start()

    data_dir = os.path.dirname(os.path.abspath(__file__))
    data = converters.load_any_file(data_dir +
                                    "/templates/upload_files/export.csv")
    data.class_is_last()

    associator = Associator(classname="weka.associations.Apriori",
                            options=["-C", "-1", "-I"])
    # associator = Associator(classname="weka.associations.Apriori", options=["-N", "9", "-I"])
    associator.build_associations(data)

    rules = str(associator)

    jvm.stop()

    return HttpResponse(rules)
Example No. 16
def all_feature(file):
    jvm.start(packages=True)
    data = converters.load_any_file(file)
    data.class_is_last()

    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    attsel = AttributeSelection()
    attsel.search(search)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.ChiSquaredAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    chi = t.astype(int)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    info_gain = t.astype(int)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.GainRatioAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    gain_ratio = t.astype(int)

    evaluator = ASEvaluation(
        classname="weka.attributeSelection.SymmetricalUncertAttributeEval")
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    t = attsel.ranked_attributes[:, 0]
    symmetric_uncertainty = t.astype(int)

    jvm.stop()

    return chi, info_gain, gain_ratio, symmetric_uncertainty
Example No. 17
    def bayes_classifier(features):
        # load the dataset
        instancias = load_any_file("caracteristicas.arff")
        # flag the last attribute as the class
        instancias.class_is_last()
        # load the Naive Bayes classifier and train it on the image features
        classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        classifier.build_classifier(instancias)
        # create a new instance from the extracted features
        new_instance = Instance.create_instance(features)
        # add the new instance to the dataset
        instancias.add_instance(new_instance)
        # link the new instance to the dataset
        new_instance.dataset = instancias
        # classify the new instance, yielding the probability that it belongs to each defined class
        classification = classifier.distribution_for_instance(new_instance)

        print("Classification", " - Apu: ", round(classification[0] * 100, 2),
              "  Nelson: ", round(classification[1] * 100, 2))

        return classification
Example No. 18
def convert_file(from_x, to_y):
    # Create nominals for emotion attr
    value_list = []
    for i in range(2):
        value_list.append(str(i))
    # Check if more nominals needed
    if not from_x.parent.name.endswith("happy_data"):
        for i in range(2, 7):
            value_list.append(str(i))

    if not isinstance(from_x, str):
        from_x = str(from_x)
    if not isinstance(to_y, str):
        to_y = str(to_y)

    # Loads data based on file type
    data = converters.load_any_file(from_x)
    # emotion attribute located at index 0
    emotion_atr = data.attribute(0)

    # need emotion attr to be nominal
    if not emotion_atr.is_nominal:
        # Modify emotion attr
        emotion_atr = emotion_atr.create_nominal(emotion_atr.name, value_list)

        # Store all emotion values before swapping
        # to modified emotion_atr
        emotion_vals = []
        for i in dataset.InstanceIterator(data):
            emotion_vals.append(int(i.get_value(0)))

        # Replace emotion attr
        data.delete_first_attribute()
        data.insert_attribute(emotion_atr, 0)

        # Set the values in new emotion attr
        for i in dataset.InstanceIterator(data):
            i.set_string_value(0, str(emotion_vals.pop(0)))

    converters.save_any_file(data, to_y)
Example No. 19
def obtainBayesNet(file):
    #The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")

    #In the case of this specific data set, the first two attributes were removed since they
    #   represent the name and ranking which are unique values that would affect the classification.
    #   Depending on the data set, certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    #It is specified that the class value is the last attribute.
    data.class_is_last()

    #Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    #The ROC-AUC is extracted from the string that is received from Weka.
    info = evaluation.class_details()
    roc_area = float(info[406:411])

    return roc_area
Example No. 20
 def loadData(self, fName, temp=True):
     if temp:
         data = converters.load_any_file(fName)
     else:
         data = converters.load_any_file(os.path.join(self.dataDir, fName))
     return data
Example No. 21
from dataformatter import DataFormatter
import weka.core.packages as packages
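# Additional imports this snippet relies on (not shown in the original):
import os
import weka.core.jvm as jvm
import weka.core.converters as converters
from weka.clusterers import Clusterer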

dataDir = os.path.join(os.path.dirname(os.path.abspath('')), 'data')
modelDir = os.path.join(os.path.dirname(os.path.abspath('')), 'models')

dformat = DataFormatter(dataDir)

dformat.dict2arff(os.path.join(dataDir, 'System.csv'),
                  os.path.join(dataDir, 'System.arff'))

#Arff_file = os.path.join(dataDir, 'System.arff')

jvm.start(packages=True)

data = converters.load_any_file(os.path.join(dataDir, 'System.arff'))
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "10", "-S", "10"])
clusterer.build_clusterer(data)

# print clusterer
# cluster the data
# for inst in data:
#     cl = clusterer.cluster_instance(inst)  # 0-based cluster index
#     dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution
#     print("cluster=" + str(cl) + ", distribution=" + str(dist))
#     print inst

# serialization.write(os.path.join(modelDir, 'SKM.model'), clusterer)

clusterEM = Clusterer(classname="weka.clusterers.EM",
Example No. 22
def convertArff2Csv(infile, outfile):
    jvm.start()
    data = converters.load_any_file(infile)
    converters.save_any_file(data, outfile)
    jvm.stop()
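# Example usage (hypothetical file names):
#   convertArff2Csv("data/iris.arff", "data/iris.csv")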
Example No. 23
def ClassifyTestSet():
	#Tests a classifier performance with a dedicated test set
	# Models are evaluated for different combinations of features
	# Several classifiers may be used 
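	# FilterAttribute and powerset appear to be project helpers: FilterAttribute
	# drops attributes whose names match a regex, and powerset enumerates every
	# subset of the given feature groups.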

	# Load Datasets
	data_dir = "Testbed/"

	#h=open(data_dir+"training_dataset.csv","rb")

	#print h
	with open(data_dir + "training_dataset.csv", "r") as a:
		print(len(a.readlines()))
	with open(data_dir + "testing_dataset.csv", "r") as a:
		print(len(a.readlines()))

	training = converters.load_any_file(data_dir+"training_dataset.csv")
	training.class_is_last()

	testing = converters.load_any_file(data_dir +"testing_dataset.csv")
	testing.class_is_last() #set class attribute to be the last one listed


	#Choose classifiers to use
	cls_classes = ["weka.classifiers.trees.RandomForest",
					"weka.classifiers.trees.J48",
					"weka.classifiers.lazy.IBk"
					]

	classifiers = []
	for cls in cls_classes:
		classifiers.append(Classifier(classname=cls))

	
	#Regex for attribute selection
	#(Useful for testing different combinations of attributes)
	identifier_att = ".*id.*"
	timeseries_att = "Mic.*"
	doppler_att = "doppler.*"
	phase_att = "phase.*"
	music_att = "music.*"
	beamform_att = "beamform.*"
	att_set = [timeseries_att, doppler_att, phase_att, music_att, beamform_att]

	##################################################
	#Remove instances identifier attribute
	training = FilterAttribute(identifier_att,training)
	testing = FilterAttribute(identifier_att,testing)
	################################################


	for att_comb in powerset(att_set):
		training_filtered = training
		testing_filtered = testing

		for att in att_comb:
			if(len(att) != len(att_set)):
				training_filtered = FilterAttribute(att,training_filtered)
				testing_filtered  = FilterAttribute(att,testing_filtered)

		print(colored("======================================================", 'green'))
		print(colored("Full attribute set: " + str(att_set), 'green'))
		print(colored("Removed attributes: " + str(att_comb), 'green'))
		print(colored("======================================================", 'green'))


		for i, cls in enumerate(classifiers):
			cls.build_classifier(training_filtered)

			evl = Evaluation(training)
			evl.test_model(cls, testing_filtered)


			print(colored("=> Testing for " + cls_classes[i], 'red'))
			print(evl.summary())
			print(evl.matrix())
Example No. 24
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name, indicator_col, images):
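    # "load" is presumably numpy.load; path_indices points at an .npz archive
    # whose arrays mark the row boundaries of the sequential folds.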
    ind_f = load(path_indices)
    lst = ind_f.files

    for item in lst:
        ind = ind_f[item] + 1  # note: only the last array in the archive is kept

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    data = converters.load_any_file(path_features)

    ind = np.append(ind, len(data))

    data.class_is_last()

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    for j in range(len(ind) - 1):
        first = ind[j]

        if j == len(ind) - 2:
            last = ind[j + 1]
        else:
            last = ind[j + 1] - 1

        d_test = data.subset(row_range=str(first) + '-' + str(last))

        if j == 0:  # first
            d_train = data.subset(row_range=str(last + 1) + '-' +
                                  str(ind[-1]))  # last element
            print(str(last + 1) + '-' + str(ind[-1]))
        elif j == len(ind) - 2:  # last
            d_train = data.subset(row_range='1-' +
                                  str(first - 1))  # last element
            print('1-' + str(first - 1))
        else:  # central
            s = '1-' + str(first - 1) + ',' + str(last + 1) + '-' + str(
                ind[-1])
            print(s)
            d_train = data.subset(row_range=s)

        cls.build_classifier(d_train)

        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        # print(type(d_train))
        # print(type(d_test))

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

    save = pout.buffer_content()

    check_folder_or_create(path_folder_save_results + '/' + 'prediction')

    with open(
            path_folder_save_results + '/' + 'prediction/' + name +
            'pred_data.csv', 'w') as f:
        f.write(save)

    buffer_save = pd.read_csv(path_folder_save_results + '/' + 'prediction/' +
                              name + 'pred_data.csv',
                              index_col=False,
                              header=None)

    col_label = buffer_save[1]
    col_prediction = buffer_save[2]
    col_different = buffer_save[3]

    create_prediction(col_label, col_prediction, col_different, indicator_col,
                      images, name, path_folder_save_results + '/prediction/')

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
Example No. 25
def experiment_more_file(path_files,
                         path_folder_save_results,
                         fold,
                         options,
                         classifier,
                         random,
                         name,
                         voting=False):
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    file_list = os.listdir(path_files)

    # keep only CSV files; removing items while iterating over the same list skips entries
    file_list = [f for f in file_list if f.endswith(".csv")]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    for file in file_list:
        indicator_table = pd.read_csv(path_files + '/indicator/' + file[0] +
                                      '_indicator.csv')
        indicator = list(indicator_table['indicator'])
        images = list(indicator_table['image'])

        data = converters.load_any_file(path_files + "/" + file)

        data.class_is_last()

        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")

        evl = Evaluation(data)

        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        save = pout.buffer_content()

        check_folder_or_create(path_folder_save_results + '/' + name + '/' +
                               'prediction')

        with open(
                path_folder_save_results + '/' + name + '/' +
                'prediction/pred_data.csv', 'w') as f:
            f.write(save)

        buffer_save = pd.read_csv(path_folder_save_results + '/' + name + '/' +
                                  'prediction/pred_data.csv',
                                  index_col=False)

        col_label = buffer_save['actual']
        col_prediction = buffer_save['predicted']
        col_different = buffer_save['error']

        create_prediction(
            col_label, col_prediction, col_different, indicator, images,
            file[:-4], path_folder_save_results + '/' + name + '/prediction/')

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv")
Example No. 26
def train(training_dataset_path, model_cache_file_name, evaluation_is_on,
          summary_file_path):
    """Model Training function

    The function uses the WEKA machine learning library through the
    python-weka-wrapper Python library. It divides the data into the given
    number of folds and performs training and evaluation. The trained model is
    copied to the __predictors global variable and also saved (together with
    the training data set) to the model_cache_file_name file. The evaluation
    summary is written to the summary_file_path file.

    Args:
        :param training_dataset_path: the path of the input arff file.
        :param model_cache_file_name:
        :param evaluation_is_on: run evaluation after training (true / false)
        :param summary_file_path: the path of the model evaluation summary file.

    Returns:
        None
    """

    global __classifiers
    global __predictors

    training_data = converters.load_any_file(training_dataset_path)
    training_data.class_is_last()

    lines = []
    summaries = []
    summary_line = [
        'Model'.ljust(16), 'Precision'.ljust(12), 'Recall'.ljust(12),
        'F-measure'.ljust(12), 'Accuracy'.ljust(12), 'FPR'.ljust(12)
    ]
    summaries.append('\t'.join(summary_line))

    for classifier, option_str in __classifiers.items():
        option_list = re.findall(r'"(?:[^"]+)"|(?:[^ ]+)', option_str)
        option_list = [s.replace('"', '') for s in option_list]

        classifier_name = classifier.split('.')[-1]
        info_str = "Using classifier: {classifier}, options: {options}".format(
            classifier=classifier_name, options=str(option_list))
        localizer_log.msg(info_str)
        lines.append(info_str)

        # Train
        cls = Classifier(classname=classifier, options=option_list)
        localizer_log.msg("Start building classifier")
        cls.build_classifier(training_data)
        localizer_log.msg("Completed building classifier")
        localizer_log.msg("Saving trained model to {model_cache_name}".format(
            model_cache_name=model_cache_file_name))

        # localizer_config.save_model(cls, training_data, model_cache_file_name)
        path = os.path.join('caches', 'model')
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
        path = os.path.join(path, model_cache_file_name + '.cache')
        cls.serialize(path)
        localizer_log.msg("Trained model saved")

        classifier2, _ = Classifier.deserialize(path)
        print(classifier2)

        __predictors[classifier_name] = cls

        if evaluation_is_on:

            # Model evaluation
            localizer_log.msg("Start evaluation classifier")
            evl = Evaluation(training_data)
            localizer_log.msg("Complete evaluation classifier")

            localizer_log.msg("Start cross-validating classifier")
            evl.crossvalidate_model(cls, training_data, 10, Random(1))
            localizer_log.msg("Complete cross-validating classifier")

            # print(evl.percent_correct)
            # print(evl.summary())
            # print(evl.class_details())

            lines.append(evl.summary())
            lines.append(evl.class_details())

            summary_line = []
            summary_line.append(classifier_name.ljust(16))
            summary_line.append("{:.3f}".format(evl.weighted_precision *
                                                100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_recall *
                                                100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_f_measure *
                                                100).ljust(12))
            summary_line.append("{:.3f}".format(evl.percent_correct).ljust(12))
            summary_line.append("{:.3f}".format(
                evl.weighted_false_positive_rate * 100).ljust(12))
            summaries.append('\t'.join(summary_line))

            # Save evaluation summary to file
            with open(summary_file_path, 'w') as f:
                f.writelines('\n'.join(lines))
                f.writelines('\n' * 5)
                f.writelines('\n'.join(summaries))
Example No. 27
"""
Naive Bayes in Weka
Created on Sun Jul 03 15:49:46 2016

@author: SkYe
"""

import weka.core.jvm as jvm
jvm.start(max_heap_size="2500m")


# Load data: Must be a weka-derived object
# Dataset has nominal and numeric variables
import weka.core.converters as converters
data_dir = "data/"
data = converters.load_any_file(data_dir + "adult.csv")
data.class_is_last()


# Create train and test sets (90% train, 10% test)
from weka.core.classes import Random
train, test = data.train_test_split(90.0, Random(1))

# Check data in datasets
print(train.num_instances)
print(test.num_instances)

# Check data in datasets
print(train.num_attributes)
print(test.num_attributes)
Example No. 28
import weka.core.jvm as jvm
import weka.core.converters as conv
from weka.classifiers import Evaluation, Classifier
from weka.core.classes import Random
import weka.plot.classifiers as plcls
import os

jvm.start(packages=True)
data = conv.load_any_file("Dataset/test.arff")
#print(data)

data.class_is_last()
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 15, Random(1))

#print(evl.summary("=== J48 on anneal (stats) === Rafael Manja", False))
#print(evl.matrix("Matriz do Rafael"))
plcls.plot_classifier_errors(evl.predictions, absolute=False, wait=True)
jvm.stop()
Example No. 30
        NewListColumns.append(ListColumns[i])

    print("NewListColumns :")
    print(NewListColumns)
    for l in (NewListColumns):

        data_file = 'C:/PythonProjects/AgentsTurboFan/Test/Test4/Agent' + l + '.csv'

        test = []
        test.append(l)
        print("test :")
        print(test)

        print("\n--> loading:\n")
        print(data_file)
        dataA = load_any_file(data_file)
        dataA.class_is_last()

        DFC = pd.read_csv('C:/PythonProjects/AgentsTurboFan/Test/Test4/Agent' +
                          l + '.csv',
                          delimiter=",")
        for a in range(len(DFC)):
            classvar = DFC.iloc[a, len(DFC.columns) - 1]
            classvarStr = str(classvar)  # after the loop this holds the last row's class value

        print('classvarStr :', classvarStr)
        print('isreal(classvarStr) :', isreal(classvarStr))

        if isreal(classvarStr):

            classifier = Classifier(classname="weka.classifiers.trees.M5P",
Example No. 31
def CrossValidateFullDataset():
	#Tests a classifier performance with 10x cross-validation

	data_dir = "test/"
	print "Loading Dataset..."
	data = converters.load_any_file(data_dir + "full_dataset.csv")
	print "Dataset Loaded!"
	
	#Set class attribute
	data.class_is_last()


	cls_classes = [#"weka.classifiers.trees.J48",
					"weka.classifiers.trees.RandomForest",
					#"weka.classifiers.lazy.IBk"
				]

	classifiers = []
	for cls in cls_classes:
		classifiers.append(Classifier(classname=cls))


	#Regex for attribute selection
	#(Useful for testing different combinations of attributes)
	identifier_att = ".*id.*"
	
	#timeseries_att = "raw.*"
	rmNoise_att = "rmNoise.*"
	#doppler_att = "doppler.*"
	#phase_att = "phase.*"
	#music_att = "music.*"
	#beamform_att = "beamform.*"
	#music_sliding_att = "music_sliding.*"
	#music_agg_att = "music_agg.*"
	#music_angles_att = "music_angles.*"

	att_set = [rmNoise_att]

	##################################################
	#Remove instances identifier attribute
	data = FilterAttribute(identifier_att,data)
	################################################


	for att_comb in powerset(att_set):
		data_filtered = data

		for att in att_comb:
			if(len(att) != len(att_set)):
				data_filtered = FilterAttribute(att,data_filtered)
		if not (set(att_set) - set(att_comb)):
			continue
		print(att_set)
		print(att_comb)
		print(colored("======================================================", 'green'))
		print(colored("Full attribute set: " + str(att_set), 'green'))
		print(colored("Removed attributes: " + str(att_comb), 'red'))
		if len(att_comb) > 0:
			print(colored("Using attributes: " + str(list(set(att_set) - set(att_comb))), 'green'))
		print(colored("======================================================", 'green'))

		print(data_dir)

		for i, cls in enumerate(classifiers):

			evl = Evaluation(data_filtered)
			evl.crossvalidate_model(cls, data_filtered, 10, Random(1))

			print colored("=> 10x cross-validation for " + cls_classes[i], 'red')
			print(evl.summary())
			print(evl.matrix())
Example No. 32
# package install
chisq_name = "EvolutionarySearch"
chisq_installed = False
for p in pkg.installed_packages():
    if p.name == chisq_name:
        chisq_installed = True
if not chisq_installed:
    pkg.install_package(chisq_name)
    print("pkg %s installed, please restart" % chisq_name)
    jvm.stop()
    sys.exit(1)
"""
data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\Data driven model _paper [June 25_2018\\FeatureSelection\\EvlSearch\\"
globbed_files = glob.glob(data_dir + "*.csv")
for csv in globbed_files:
    data = converters.load_any_file(csv)
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.EvolutionarySearch",
                      options=[
                          "-population-size", "200", "-generations", "500",
                          "-crossover-probability", "0.6"
                      ])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
Example No. 33
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name):
    ind_f = load(path_indices)

    lst = ind_f.files

    for item in lst:
        ind = ind_f[item] + 1  # note: only the last array in the archive is kept

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    data = converters.load_any_file(path_features)

    ind = np.append(ind, len(data))

    data.class_is_last()

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    for j in range(len(ind) - 1):
        print(j)

        print(str(ind[j]) + '-' + str(ind[j + 1]))

        d_test = data.subset(row_range=str(ind[j]) + '-' + str(ind[j + 1]))

        if j == 0:  # first
            d_train = data.subset(row_range=str(ind[j + 1] + 1) + '-' +
                                  str(ind[-1]))  # last element
        elif j == len(ind) - 2:  # last
            d_train = data.subset(row_range='1-' +
                                  str(ind[j] - 1))  # last element
        else:  # central
            s = '1-' + str(ind[j] - 1) + ',' + str(ind[j + 1] + 1) + '-' + str(
                ind[-1])
            d_train = data.subset(row_range=s)

        cls.build_classifier(d_train)

        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        save = pout.buffer_content()

        with open(
                path_folder_save_results + '/prediction/' + name +
                str(j) + 'pred_data.csv', 'w') as f:
            f.write(save)

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)