def showAttributeRanking(self, data):
    """Print an information-gain ranking of the attributes in ``data``.

    Pairs Weka's ``InfoGainAttributeEval`` with a ``Ranker`` search, runs the
    selection on ``data`` and prints the number of selected attributes, the
    selected attributes themselves and Weka's textual report.

    :param data: dataset handed to ``select_attributes``
    """
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"],
    )
    info_gain = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval",
    )

    selector = AttributeSelection()
    selector.set_search(ranker)
    selector.set_evaluator(info_gain)
    selector.select_attributes(data)

    print("# attributes: {}".format(selector.get_number_attributes_selected()))
    print("attributes: {}".format(selector.get_selected_attributes()))
    print("result string:\n" + selector.to_results_string())
Beispiel #2
0
def Feature_Selection(infile):
    """Run CfsSubsetEval/BestFirst attribute selection on a CSV file and print the summary.

    Starts a JVM, loads ``infile`` with Weka's ``CSVLoader``, applies the
    ``Remove`` filter with range ``1``, marks the last remaining attribute as
    the class and prints the attribute-selection result string. The JVM is
    stopped again even when loading or selection fails.

    :param infile: CSV file name, resolved against the current working directory
    :type infile: str
    """
    # os.path.join also copes with an absolute ``infile`` and with platforms
    # whose separator is not "/".
    csvpath = os.path.join(os.getcwd(), infile)

    jvm.start(packages=True, max_heap_size="4g")
    try:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("\n\n")
        print("Loaded file:  {}".format(infile))
        csvloader = Loader(classname="weka.core.converters.CSVLoader")
        csvdata = csvloader.load_file(csvpath)

        remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                         options=["-R", " 1"])
        remover.inputformat(csvdata)
        filtered_data = remover.filter(csvdata)
        filtered_data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.BestFirst",
                          options=["-D", "1", "-N", "5"])
        evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                                 options=["-P", "1", "-E", "1"])
        attribs = AttributeSelection()
        attribs.search(search)
        attribs.evaluator(evaluator)
        attribs.select_attributes(filtered_data)
        print("Summary of Attribute Selection: ")
        print(attribs.results_string)
    finally:
        # Always release the JVM, otherwise a failure above leaves it running.
        jvm.stop()
def use_low_level(data):
    """
    Performs attribute selection by calling the wrapped Java object directly.

    The evaluator (CfsSubsetEval) and the search (GreedyStepwise with "-B")
    are attached through ``jwrapper`` and the raw ``jobject`` handles rather
    than through the Python-level setter methods.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n3. Low-level")
    selector = AttributeSelection()
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    greedy = ASSearch(
        classname="weka.attributeSelection.GreedyStepwise",
        options=["-B"],
    )

    # hand the underlying Java objects to the Java-side setters
    selector.jwrapper.setEvaluator(cfs_eval.jobject)
    selector.jwrapper.setSearch(greedy.jobject)
    selector.select_attributes(data)

    chosen = selector.selected_attributes.tolist()
    print("selected attribute indices (starting with 0):\n" + str(chosen))
Beispiel #4
0
 def featureSelection(self):
     """Select features from ``self.original_data`` with GeneticSearch + CfsSubsetEval.

     Stores the outcome on the instance:

     * ``selected_features`` - the selector's ``selected_attributes``
     * ``num_features`` - the selector's ``number_attributes_selected``
     * ``data_selected`` - ``original_data`` passed through
       ``reduce_dimensionality``
     """
     genetic_search = ASSearch(
         classname="weka.attributeSelection.GeneticSearch",
         options=["-Z", "1024", "-G", "20", "-C", "0.6", "-M", "0.3"],
     )
     cfs_evaluator = ASEvaluation(
         classname="weka.attributeSelection.CfsSubsetEval",
         options=["-P", "1", "-E", "1"],
     )

     selector = AttributeSelection()
     selector.search(genetic_search)
     selector.evaluator(cfs_evaluator)
     selector.select_attributes(self.original_data)

     self.selected_features = selector.selected_attributes
     self.num_features = selector.number_attributes_selected
     reduced = selector.reduce_dimensionality(self.original_data)
     self.data_selected = reduced
Beispiel #5
0
def cfs(table, cores):
    """Run CfsSubsetEval/BestFirst attribute selection on a CSV table.

    The last attribute of the loaded table is used as the class.

    :param table: path of the CSV file to load
    :type table: str
    :param cores: value passed to CfsSubsetEval's ``-P`` and ``-E`` options;
                  may be given as an int or as a string
    :return: the selected attributes reported by Weka, as a plain list
    :rtype: list
    """
    loader = Loader("weka.core.converters.CSVLoader")
    anneal_data = loader.load_file(table)
    anneal_data.class_is_last()
    logger.info("Running attribute selection for: %s. Please, wait a moment.",
                table.split("/")[-1])

    # Weka option arrays hold strings only, so normalise ``cores`` once here.
    threads = str(cores)
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "0", "-N", "5"])
    evaluation = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                              options=["-Z", "-P", threads, "-E", threads])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)

    # Fetch the result once; every property access goes back into the JVM.
    selected = attsel.selected_attributes
    logger.info("Selected attributes: %s", selected)
    anneal_data.delete(index=None)  # TODO: deleting the instances does not work yet

    return list(selected)
Beispiel #6
0
def relieff(filter_data, feature_names):
    """Return the feature names in the order produced by a ReliefF ranking.

    :param filter_data: dataset handed to Weka's attribute selection
    :param feature_names: sequence mapping an attribute index to its name
    :return: names of the ranked attributes, class column excluded
    :rtype: list
    """
    # search strategy: plain Ranker
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-T", "-1.7976931348623157E308", "-N", "-1"],
    )
    # evaluator: ReliefF; the trailing option is the number of nearest neighbors
    relief_eval = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=["-M", "-1", "-D", "1", "-K", "10"],
    )

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(relief_eval)
    selector.select_attributes(filter_data)

    # the wrapper reports the class column index as the final entry; drop it
    ranked_indices = selector.selected_attributes[:-1]
    return [feature_names[idx] for idx in ranked_indices]
def feature_selection_weka(x_train, y_train, x_test, input_path, features):
    """Restrict train/test frames to the features chosen by Weka's CfsSubsetEval.

    The chosen columns are cached in
    ``<input_path>selected_features_weka_<features>.csv``. When that file
    exists its column names are reused; otherwise the training data is written
    to ``Weka/train_weka_format.csv``, ranked with GreedyStepwise +
    CfsSubsetEval, and the cache file is created.

    :param x_train: training features (pandas DataFrame)
    :param y_train: training target, stored as the ``target`` column of the
                    exported CSV
    :param x_test: test features (pandas DataFrame)
    :param input_path: prefix (directory including trailing separator) of the
                       cache file
    :param features: percentage of the training columns to keep
    :return: ``(x_train_filtered, x_test_filtered)``
    """
    n_keep = int(x_train.shape[1] * (features / 100.0))
    if not os.path.exists('Weka'):
        os.mkdir('Weka')

    cache_file = input_path + f'selected_features_weka_{features}.csv'

    if os.path.exists(cache_file):
        selected_features = pd.read_csv(cache_file, index_col=0).columns
    else:
        # drop columns whose value never differs from the first row
        varying = (x_train != x_train.iloc[0]).any()
        x_train = x_train.loc[:, varying]

        # export under positional column names ("0a", "1a", ...) plus the target
        export = x_train.copy()
        export.columns = [str(pos) + "a" for pos in range(export.shape[1])]
        export['target'] = y_train
        export.to_csv('Weka/train_weka_format.csv', index=False)

        from weka.attribute_selection import ASEvaluation, AttributeSelection, ASSearch
        from weka.core.converters import Loader, Saver
        csv_loader = Loader(classname="weka.core.converters.CSVLoader")
        weka_data = csv_loader.load_file('Weka/train_weka_format.csv',
                                         class_index='last')

        stepwise = ASSearch(
            classname="weka.attributeSelection.GreedyStepwise",
            options=["-C", "-R", "-N", str(n_keep)])
        cfs_eval = ASEvaluation(
            classname="weka.attributeSelection.CfsSubsetEval",
            options=["-P", "1", "-E", "1", "-L"])
        selector = AttributeSelection()
        selector.search(stepwise)
        selector.evaluator(cfs_eval)
        selector.select_attributes(weka_data)

        ranking = pd.DataFrame(selector.ranked_attributes,
                               columns=['Feature', 'Rank'])
        ranking['Feature'] = ranking['Feature'].astype(int)
        top_features = ranking.loc[:n_keep - 1, 'Feature']

        chosen = x_train.iloc[:, top_features]
        chosen.to_csv(cache_file)
        selected_features = chosen.columns

    return x_train.loc[:, selected_features], x_test.loc[:, selected_features]
Beispiel #8
0
def information_gain(filter_data, feature_names):
    """Return the feature names in the order produced by an information-gain ranking.

    :param filter_data: dataset handed to Weka's attribute selection
    :param feature_names: sequence mapping an attribute index to its name
    :return: names of the ranked attributes, class column excluded
    :rtype: list
    """
    # Ranker options: the 2nd value is the score threshold, the last one
    # determines how many attributes are returned
    ranker_options = ["-T", "-1.7976931348623157E308", "-N", "-1"]
    ranker = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=ranker_options)
    # the evaluator takes no options
    ig_eval = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval",
        options=[])

    selector = AttributeSelection()
    selector.search(ranker)
    selector.evaluator(ig_eval)
    selector.select_attributes(filter_data)

    # the wrapper reports the class column index as the final entry; drop it
    names = []
    for idx in selector.selected_attributes[:-1]:
        names.append(feature_names[idx])
    return names
Beispiel #9
0
def get_IG(ofile_dir, loader):
	"""Compute information-gain scores for the attributes of a dataset file.

	The file is loaded with ``loader``, its last attribute is used as the
	class, and InfoGainAttributeEval is run with a Ranker search.

	When fewer than two attributes are reported as selected, the scores are
	parsed from the "Ranked attributes" section of Weka's result string and
	``ofile_dir`` is removed afterwards. Otherwise the scores are taken from
	``ranked_attributes``.

	:param ofile_dir: path of the dataset file to load
	:param loader: object whose ``load_file`` returns the dataset
	:return: ``(results, mean_score)`` where ``results`` maps attribute name
	         to score and ``mean_score`` is the average score
	:raises ZeroDivisionError: if the text-parsing branch finds no ranked lines
	"""
	data = loader.load_file(ofile_dir)
	data.class_is_last()

	evaluator = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval")
	search = ASSearch(classname="weka.attributeSelection.Ranker", options=["-T", "-1.7976931348623157E308", "-N", "-1"])
	attsel = AttributeSelection()
	attsel.search(search)
	attsel.evaluator(evaluator)

	attsel.select_attributes(data)

	results = {}

	if attsel.number_attributes_selected < 2:
		in_ranking = False
		for line in attsel.results_string.split('\n'):
			if in_ranking:
				# A line with at most two space-separated pieces ends the section.
				if len(line.split(' ')) <= 2:
					break
				# Columns are padded with spaces: drop the empty pieces.
				tokens = [tok for tok in line.split(' ') if tok != '']
				# tokens[0] is the score; everything after the second token
				# is the (possibly multi-word) attribute name.
				name = ' '.join(tokens[2:])
				results[str(name.strip())] = float(tokens[0].strip())
			if "Ranked attributes" in line:
				in_ranking = True
		mean_score = sum(results.values()) / len(results)
		# Argument list instead of a shell string, so paths containing spaces
		# or shell metacharacters are passed to rm verbatim.
		import subprocess
		subprocess.call(["rm", "-r", ofile_dir])
	else:
		# Fetch once; every property access goes back into the JVM.
		ranked = attsel.ranked_attributes
		results = {str(data.attribute(attr[0]).name): attr[1] for attr in ranked}
		mean_score = ranked[:, 1].mean()

	return results, mean_score
Beispiel #10
0
def main(argv):
    """
    Ranks the attributes of an ARFF file with ReliefF and prints the result.

    :param argv: command-line arguments without the program name; must hold
                 exactly one entry, the path of the ARFF file
    :type argv: list
    """
    if len(argv) != 1:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("input ARFF file path missing to process...Exiting")
        print("{} {}".format(argv, len(argv)))
        usage()
        # Non-zero status so callers can tell the run did not happen.
        sys.exit(1)

    # load a dataset
    anneal_file = argv[0]
    print("Loading dataset: " + anneal_file)
    loader = Loader("weka.core.converters.ArffLoader")
    anneal_data = loader.load_file(anneal_file)
    anneal_data.class_is_last()

    # perform attribute selection
    print("Attribute selection")
    # Weka options are flag/value pairs. The previous bare values
    # ("Forward", "1", "5" and "10", "-1", "1", "2", "False") are not
    # recognised options, so the intended settings are spelled out with
    # their flags: rank every attribute, and ReliefF with sample size -1,
    # seed 1 and 10 nearest neighbours.
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    evaluation = ASEvaluation(classname="weka.attributeSelection.ReliefFAttributeEval",
                              options=["-M", "-1", "-D", "1", "-K", "10"])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("Result string:\n" + attsel.results_string)

    """
def main():
    """
    Demonstrates attribute selection and cross-validated attribute ranking
    on the "anneal.arff" dataset from the helper's data directory.
    """

    # load the dataset and use its last attribute as class
    arff_path = helper.get_data_dir() + os.sep + "anneal.arff"
    helper.print_info("Loading dataset: " + arff_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(arff_path)
    dataset.class_is_last()

    # subset selection: BestFirst search scored by CfsSubsetEval
    helper.print_title("Attribute selection")
    best_first = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=["-D", "1", "-N", "5"])
    cfs_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"])
    subset_selector = AttributeSelection()
    subset_selector.search(best_first)
    subset_selector.evaluator(cfs_eval)
    subset_selector.select_attributes(dataset)
    print("# attributes: {}".format(subset_selector.number_attributes_selected))
    print("attributes: {}".format(subset_selector.selected_attributes))
    print("result string:\n" + subset_selector.results_string)

    # ranking: Ranker search scored by information gain, 2 folds, seed 42
    helper.print_title("Attribute ranking (2-fold CV)")
    ranker = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-N", "-1"])
    info_gain = ASEvaluation("weka.attributeSelection.InfoGainAttributeEval")
    rank_selector = AttributeSelection()
    rank_selector.ranking(True)
    rank_selector.folds(2)
    rank_selector.crossvalidation(True)
    rank_selector.seed(42)
    rank_selector.search(ranker)
    rank_selector.evaluator(info_gain)
    rank_selector.select_attributes(dataset)
    print("ranked attributes:\n{}".format(rank_selector.ranked_attributes))
    print("result string:\n" + rank_selector.results_string)
Beispiel #12
0
      "weka.classifiers.trees.J48"], ["-D", "1", "-N", "2"]),
    (["-F", "10", "-T", "-1", "-B",
      "weka.classifiers.trees.J48"], ["-D", "2", "-N", "2"]),
)

# attribute selection
# Each entry of ``setups`` is a pair: (options for WrapperSubsetEval,
# options for BestFirst). Run one selection per pair and print Weka's report.
for setup in setups:
    evl, search = setup
    aseval = ASEvaluation(
        classname="weka.attributeSelection.WrapperSubsetEval", options=evl)
    assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                        options=search)
    print("\n--> Attribute selection\n")
    # show the evaluator and search command lines that are about to run
    print(aseval.to_commandline())
    print(assearch.to_commandline())
    attsel = AttributeSelection()
    attsel.evaluator(aseval)
    attsel.search(assearch)
    attsel.select_attributes(data)
    print(attsel.results_string)

# cross-validation
# Fresh evaluator/search pair (WrapperSubsetEval around J48, BestFirst);
# ``aseval``, ``assearch`` and ``attsel`` are rebound here on purpose.
aseval = ASEvaluation(classname="weka.attributeSelection.WrapperSubsetEval",
                      options=["-F", "10", "-B", "weka.classifiers.trees.J48"])
assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                    options=["-D", "0", "-N", "5"])
print("\n--> Attribute selection (cross-validation)\n")
print(aseval.to_commandline())
print(assearch.to_commandline())
attsel = AttributeSelection()
attsel.evaluator(aseval)
    # PREPROCESS - turn numeric to nominal
    numToNom = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "first-last"])
    numToNom.inputformat(data)
    data = numToNom.filter(data)
    data.class_is_last()
    print(data.summary(data))

    # Use CFS to find subset of attributes
    print("CfsSubsetEval Attribute Selection")
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "1", "-N", "11"])
    evaluation = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval")
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("result string:\n" + attsel.results_string)

    # Find indices of attributes to delete
    attributes_to_delete = []
    for x in range(0, data.num_attributes):
        if x not in attsel.selected_attributes and x != data.class_index:
            attributes_to_delete.append(x)

    # Create copy of data and delete unnecessary attributes
    dataSubset = data.copy_instances(data)
Beispiel #14
0
def select_attribute(file):
    """Rank the attributes of an ARFF file by correlation and record the top ones.

    The first attribute of the file is used as the class. The result is
    written via ``output_select_attribute`` to
    ``<file stem>_attsel_results.txt`` next to the input file, and the first
    2, 5, 10 and 50 selected attribute indices are appended to the
    module-level lists ``Field2``, ``Field5``, ``Field10`` and ``Field50``.
    If ``Field10`` then contains duplicates, all four lists are replaced by
    de-duplicated versions (element order is not preserved).

    Files that do not end in ".arff" are reported and skipped.

    :param file: path of the ARFF file
    :type file: pathlib.Path
    """
    global Field50
    global Field10
    global Field5
    global Field2

    filename = file.parts[-1]  # Get filename from Pathlib object
    data_dir = file.parents[0]  # Data directory currently in

    print("Selecting attributes from %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    filename_base = filename[:-5]  # Removes '.arff' from filename
    data = load_Arff_file(file)  # Load data from arff
    data.class_is_first()  # Set first attr as class

    # Define Attribute selection
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "0.01", "-N", "-1"])
    # Define Attribute Evaluator
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CorrelationAttributeEval",
        options=[])

    # Run attribution selection
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)

    # Define filepath and output results
    attsel_output = filename_base + "_attsel_results.txt"
    output_select_attribute(attsel, data_dir / attsel_output)

    # Debug Analysis
    # Fetch the selection once: each property access goes back into the JVM.
    selected = attsel.selected_attributes
    print(selected)
    # Slicing takes "up to n" entries, so a ranking with fewer than 50
    # attributes no longer raises IndexError.
    Field2.extend(selected[:2])
    Field5.extend(selected[:5])
    Field10.extend(selected[:10])
    Field50.extend(selected[:50])
    print(Field2)
    print(Field5)
    print(Field10)
    print(Field50)

    if len(set(Field10)) == len(Field10):
        print("no duplicates found")

    else:
        print("duplicate found")
        Field50 = list(set(Field50))
        Field10 = list(set(Field10))
        Field5 = list(set(Field5))
        Field2 = list(set(Field2))
                                            'd_FOLDS_test_' + str(fold) +
                                            '.csv')
                dataTrain.class_is_last()
                dataTest.class_is_last()

                from weka.attribute_selection import AttributeSelection, ASEvaluation, ASSearch
                search = ASSearch(
                    classname="weka.attributeSelection.RerankingSearch"
                )  #,options=["-method", "2"])
                evaluator = ASEvaluation(
                    classname='weka.attributeSelection.ClassifierAttributeEval',
                    options=['-B', 'weka.classifiers.bayes.NaiveBayes'])

                Eval = AttributeSelection(
                    classname='weka.attributeSelection.ClassifierAttributeEval',
                    options=[
                        '-B', 'weka.classifiers.bayes.NaiveBayes', '--',
                        "-S 'weka.attributeSelection.RerankingSearch -method 2'"
                    ])

                from weka.filters import Filter

                NominalToBinary = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NominalToBinary",
                    options=["-R", "5,7,8"])
                NumericToNominal = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NumericToNominal")
                ReplaceMV = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.ReplaceMissingValues")
Beispiel #16
0
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        """
        Run the pipeline end to end and store the results in ``self.gist``.

        Steps, each reflected in ``self.logs`` and in the status of the
        matching entries of ``self.stages``:

        1. locate the input file stage ('dat.fle') and derive the output
           directory and base name from it;
        2. start the JVM and load the dataset (currently the hard-coded
           debugging file 'iris.arff' in the output directory);
        3. split the data into training and testing sets with
           StratifiedRemoveFolds;
        4. run every enabled search/evaluator combination from
           ``config.FEATURE_SELECTION`` on the training set;
        5. build, serialize and evaluate every enabled model from
           ``config.MODEL``, collecting summary, confusion matrix and plots;
        6. stop the JVM and write ``<name>.cgist`` next to the input.

        :param cdat: not used by the current body (its only use, the ARFF
                     export, is commented out)
        :param heap_size: JVM maximum heap size in megabytes
        :param seed: seed for the stratified split; a random value in
                     [0, 1000] is drawn when ``None``
        :param verbose: not used by the current body
        """
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This bug, I don't know why, using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        # Training set: same fold options plus '-V'; the testing set below
        # reuses the filter with the plain options.
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                # ``options`` belongs to the constructor. It used to sit inside
                # the str.format() call, where it was silently ignored and the
                # configured options never reached Weka.
                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Search.NAME),
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Evaluator.NAME),
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                # Class indices for the ROC/PRC plots, taken from the
                # instances; ``iclass`` keeps the value of the last instance.
                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))
                
                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                # NOTE(review): the per-instance predictions are computed but
                # not stored anywhere - confirm whether they are still needed.
                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
Beispiel #17
0
# For every "*blues.csv" statistics file: load it, optionally run a
# PrincipalComponents/Ranker attribute selection, and save Weka's report.
for statisticFile in statisticsList:
    statisticFilePath = (os.path.join(statisticsDataFolder, statisticFile))
    print(statisticFilePath)

    if statisticFile.endswith("blues.csv"):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(statisticFilePath)
        data.class_is_last()
        # print() with one argument behaves the same on Python 2 and 3
        print(data)

        write_file = "PCA_blues.csv"

        if atributeEvaluator == "PrincipalComponents":
            evaluator = ASEvaluation(
                classname="weka.attributeSelection.PrincipalComponents")
            search = ASSearch(classname="weka.attributeSelection.Ranker")
            attsel = AttributeSelection()
            attsel.search(search)
            attsel.evaluator(evaluator)
            print(attsel.select_attributes(data))
            print("# attributes: " + str(attsel.number_attributes_selected))
            num_attributes = attsel.number_attributes_selected
            print("attributes: " + str(attsel.selected_attributes))
            y = attsel.selected_attributes
            results_pca = attsel.results_string

            # The report exists only when the PCA branch ran, so it is written
            # here; previously any other evaluator hit an unbound
            # ``results_pca``. os.path.join replaces the hard-coded "\\"
            # separator, and the string is written in one call instead of
            # character by character.
            with open(os.path.join(pcaDataSaving, write_file), "w") as output:
                output.write(results_pca)

        #Adding Eigenvectors Value to create a matrix
Beispiel #18
0
        ["-F", "10", "-T", "-1", "-B", "weka.classifiers.trees.J48"],
        ["-D", "2", "-N", "2"]
    ),
)

# attribute selection
# Each entry of ``setups`` is a pair: (options for WrapperSubsetEval,
# options for BestFirst). Run one selection per pair and print Weka's report.
for setup in setups:
    evl, search = setup
    aseval = ASEvaluation(classname="weka.attributeSelection.WrapperSubsetEval",
                          options=evl)
    assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                        options=search)
    print("\n--> Attribute selection\n")
    # show the evaluator and search command lines that are about to run
    print(aseval.to_commandline())
    print(assearch.to_commandline())
    attsel = AttributeSelection()
    attsel.evaluator(aseval)
    attsel.search(assearch)
    attsel.select_attributes(data)
    print(attsel.results_string)

# cross-validation
# Fresh evaluator/search pair (WrapperSubsetEval around J48, BestFirst);
# ``aseval``, ``assearch`` and ``attsel`` are rebound here on purpose.
aseval = ASEvaluation(classname="weka.attributeSelection.WrapperSubsetEval",
                      options=["-F", "10", "-B", "weka.classifiers.trees.J48"])
assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                    options=["-D", "0", "-N", "5"])
print("\n--> Attribute selection (cross-validation)\n")
print(aseval.to_commandline())
print(assearch.to_commandline())
attsel = AttributeSelection()
attsel.evaluator(aseval)
Beispiel #19
0
# cross-validate classifiers
# Baseline: 10-fold cross-validation (Random seed 1) of every classifier
# class name in ``classifiers`` on the full dataset.
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct))

# wrapper
# Same evaluation, but on data reduced by WrapperSubsetEval (wrapping the
# classifier under test via -B) with a BestFirst search.
for classifier in classifiers:
    aseval = ASEvaluation(
        classname="weka.attributeSelection.WrapperSubsetEval",
        options=["-B", classifier])
    assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                        options=[])
    attsel = AttributeSelection()
    attsel.evaluator(aseval)
    attsel.search(assearch)
    attsel.select_attributes(data)
    # keep only the selected attributes
    reduced = attsel.reduce_dimensionality(data)

    cls = Classifier(classname=classifier)
    evl = Evaluation(reduced)
    evl.crossvalidate_model(cls, reduced, 10, Random(1))
    print("%s (reduced): %0.0f%%" % (classifier, evl.percent_correct))

# meta-classifier
# One AttributeSelectedClassifier per (wrapper classifier, base classifier) pair.
for wrappercls in classifiers:
    for basecls in classifiers:
        meta = SingleClassifierEnhancer(
            classname="weka.classifiers.meta.AttributeSelectedClassifier")
Beispiel #20
0
]

# cross-validate classifiers
# Baseline: 10-fold cross-validation (Random seed 1) of every classifier
# class name in ``classifiers`` on the full dataset.
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct()))

# wrapper
# Same evaluation, but on data reduced by WrapperSubsetEval (wrapping the
# classifier under test via -B) with a BestFirst search.
for classifier in classifiers:
    aseval = ASEvaluation(classname="weka.attributeSelection.WrapperSubsetEval",
                          options=["-B", classifier])
    assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                        options=[])
    attsel = AttributeSelection()
    attsel.set_evaluator(aseval)
    attsel.set_search(assearch)
    attsel.select_attributes(data)
    # keep only the selected attributes
    reduced = attsel.reduce_dimensionality(data)

    cls = Classifier(classname=classifier)
    evl = Evaluation(reduced)
    evl.crossvalidate_model(cls, reduced, 10, Random(1))
    print("%s (reduced): %0.0f%%" % (classifier, evl.percent_correct()))

# meta-classifier
for wrappercls in classifiers:
    for basecls in classifiers:
        meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
        meta.set_options(
Beispiel #21
0
 def showAttributeRanking(self, data):
     """Rank the attributes of ``data`` by information gain and print the result.

     Uses a ``Ranker`` search with ``InfoGainAttributeEval`` and prints the
     count of selected attributes, the selected attributes and the result
     string reported by the selector.

     :param data: dataset handed to ``select_attributes``
     """
     ranker_search = ASSearch(
         classname="weka.attributeSelection.Ranker",
         options=["-T", "-1.7976931348623157E308", "-N", "-1"])
     gain_evaluator = ASEvaluation(
         classname="weka.attributeSelection.InfoGainAttributeEval")

     ranking = AttributeSelection()
     ranking.set_search(ranker_search)
     ranking.set_evaluator(gain_evaluator)
     ranking.select_attributes(data)

     print("# attributes: %s" % (ranking.get_number_attributes_selected(),))
     print("attributes: %s" % (ranking.get_selected_attributes(),))
     report = ranking.to_results_string()
     print("result string:\n" + report)
def all_feature(file):
    """Rank the attributes of a dataset with four single-attribute evaluators.

    Starts a JVM, loads ``file``, uses the last attribute as class and runs a
    Ranker search with, in turn, ChiSquared, InfoGain, GainRatio and
    SymmetricalUncert attribute evaluators. The JVM is stopped again even
    when loading or ranking fails.

    :param file: path of the dataset, loaded via ``converters.load_any_file``
    :return: ``(chi, info_gain, gain_ratio, symmetric_uncertainty)``; each is
             the first column of ``ranked_attributes`` cast to int
    :rtype: tuple
    """
    # Order matters: it defines the order of the returned tuple.
    evaluator_classes = (
        "weka.attributeSelection.ChiSquaredAttributeEval",
        "weka.attributeSelection.InfoGainAttributeEval",
        "weka.attributeSelection.GainRatioAttributeEval",
        "weka.attributeSelection.SymmetricalUncertAttributeEval",
    )

    jvm.start(packages=True)
    try:
        data = converters.load_any_file(file)
        data.class_is_last()

        search = ASSearch(classname="weka.attributeSelection.Ranker",
                          options=["-T", "-1.7976931348623157E308", "-N", "-1"])
        # One selector is reused; only the evaluator changes per run.
        attsel = AttributeSelection()
        attsel.search(search)

        rankings = []
        for classname in evaluator_classes:
            attsel.evaluator(ASEvaluation(classname=classname))
            attsel.select_attributes(data)
            rankings.append(attsel.ranked_attributes[:, 0].astype(int))
    finally:
        # Always release the JVM, otherwise a failure above leaves it running.
        jvm.stop()

    chi, info_gain, gain_ratio, symmetric_uncertainty = rankings
    return chi, info_gain, gain_ratio, symmetric_uncertainty
    jvm.stop()
    sys.exit(1)
"""
# Run a 10-fold cross-validated EvolutionarySearch/CfsSubsetEval attribute
# selection on every CSV file in ``data_dir`` and append Weka's report to
# "<csv path>._report.csv".
data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\Data driven model _paper [June 25_2018\\FeatureSelection\\EvlSearch\\"
globbed_files = glob.glob(data_dir + "*.csv")
for csv in globbed_files:
    data = converters.load_any_file(csv)
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.EvolutionarySearch",
                      options=[
                          "-population-size", "200", "-generations", "500",
                          "-crossover-probability", "0.6"
                      ])
    # "-E" needs its leading dash to be read as an option flag; it was
    # written as a bare "E".
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
    evl = Evaluation(data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes: " + str(attsel.selected_attributes))
    print("result string:\n" + attsel.results_string)
    print(evl)
    # write the report for each file
    with open(f"{csv}._report.csv", "a") as outfile:
        outfile.write(attsel.results_string)
    #with open(f"{csv}._label.txt","a") as output:
Beispiel #24
0
def main():
    """
    Example run: subset selection followed by a cross-validated ranking on
    the "anneal.arff" dataset from the helper's data directory.
    """

    # dataset: anneal.arff, last attribute is the class
    data_path = helper.get_data_dir() + os.sep + "anneal.arff"
    helper.print_info("Loading dataset: " + data_path)
    data = Loader("weka.core.converters.ArffLoader").load_file(data_path)
    data.class_is_last()

    # part 1: BestFirst search scored by CfsSubsetEval
    helper.print_title("Attribute selection")
    bf_search = ASSearch(
        classname="weka.attributeSelection.BestFirst",
        options=["-D", "1", "-N", "5"],
    )
    cfs = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"],
    )
    picker = AttributeSelection()
    picker.search(bf_search)
    picker.evaluator(cfs)
    picker.select_attributes(data)

    chosen = picker.selected_attributes
    print("# attributes: %s" % (picker.number_attributes_selected,))
    print("attributes (as numpy array): " + str(chosen))
    print("attributes (as list): " + str(list(picker.selected_attributes)))
    print("result string:\n" + picker.results_string)

    # part 2: Ranker search scored by information gain, 2 folds, seed 42
    helper.print_title("Attribute ranking (2-fold CV)")
    rank_search = ASSearch(
        classname="weka.attributeSelection.Ranker",
        options=["-N", "-1"],
    )
    gain = ASEvaluation("weka.attributeSelection.InfoGainAttributeEval")
    ranker = AttributeSelection()
    ranker.ranking(True)
    ranker.folds(2)
    ranker.crossvalidation(True)
    ranker.seed(42)
    ranker.search(rank_search)
    ranker.evaluator(gain)
    ranker.select_attributes(data)
    print("ranked attributes:\n" + str(ranker.ranked_attributes))
    print("result string:\n" + ranker.results_string)