Example #1
def load(args):
    """Load the needed data according to the arguments given.
        If needed, given elements are removed or as only ones preserved.

    Args:
        args (object): An argparse Namespace that holds the command-line arguments as its attributes.

    Returns:
        names (array): The names associated with the rows of the data matrix
        x (array): The data matrix
    """
    names = None
    x = None

    # Forward the number of dimensions only when it was supplied.
    if args.information is not None:
        names, x = ld.load_features(args.path,
                                    args.file,
                                    normalize=args.no_normalize,
                                    add_information=args.add_information,
                                    expand=args.expand,
                                    degree=args.degree,
                                    dimensions=args.information)
    else:
        names, x = ld.load_features(args.path,
                                    args.file,
                                    normalize=args.no_normalize,
                                    add_information=args.add_information,
                                    expand=args.expand,
                                    degree=args.degree)

    if args.elements is not None:
        names, x = ei.select_elements_from_data(names, x, args.elements,
                                                args.remove)

    return names, x
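A minimal sketch of how the argparse Namespace consumed by load() might be built; the option names are inferred from the attributes accessed above, and all defaults, types and actions are assumptions:

import argparse

parser = argparse.ArgumentParser(description='Load and filter the feature data.')
parser.add_argument('--path', default='data/')
parser.add_argument('--file', default='features.csv')
# store_false so that normalization stays on unless the flag is given
parser.add_argument('--no_normalize', action='store_false')
parser.add_argument('--add_information', default=None)
parser.add_argument('--expand', action='store_true')
parser.add_argument('--degree', type=int, default=2)
# number of dimensions forwarded to load_features() when given
parser.add_argument('--information', type=int, default=None)
parser.add_argument('--elements', nargs='*', default=None)
parser.add_argument('--remove', action='store_true')

args = parser.parse_args()
names, x = load(args)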
Example #2
def prepare_data(path,
                 filename_features,
                 filename_energies,
                 normalize=True,
                 expand=False,
                 degree=2,
                 remove=False,
                 elements=None):
    """Loads and prepares the data necessary for the plotting

    Args:
        path (str): path containing the data
        filename_features (str): name of the features data set file
        filename_energies (str): name of file for the energies associated to the molecules
        normalize (bool): if True, normalizes the data matrix (features)
        expand (bool): if True, makes column-wise expansion of the data matrix (features)
        degree (int):  degree of expansion. Only works if expand=True
        remove (bool): if True, removes from the data the molecules containing the elements in the list elements
        elements (list): if remove=True, then all molecules elements specified in this list will be removed from the data

    Returns:
        names: names of the load molecules
        x: data matrix (features)
        energies: Delta G(Rxn A) energies of the molecules
        not_normalized_data: non normalized data matrix (features)
        min_energy: minimum present in all energies
        max_energy: maximum present in all energies

    Note: all the data is being shuffled (but preserving the order between names, x, energies and not_normalized_data)
    """
    names, x = ld.load_features(path,
                                filename_features,
                                expand=expand,
                                degree=degree,
                                normalize=normalize)
    _, not_normalized_data = ld.load_features(path,
                                              filename_features,
                                              expand=False,
                                              normalize=False)
    energies, energies_names = ld.load_energies(path, filename_energies)

    min_energy = np.min(energies)
    max_energy = np.max(energies)

    if elements is not None and len(elements) > 0:
        # Keep the unfiltered names so that the non-normalized matrix is
        # filtered against the same rows as x.
        all_names = names
        names, x = ei.select_elements_from_data(names,
                                                x,
                                                elements,
                                                remove=remove)
        _, not_normalized_data = ei.select_elements_from_data(
            all_names, not_normalized_data, elements, remove=remove)
        energies_names, energies = ei.select_energies_from_data(energies_names,
                                                                energies,
                                                                elements,
                                                                remove=remove)

    indices = np.random.permutation(names.shape[0])
    return (names[indices], x[indices], energies[indices],
            not_normalized_data[indices], min_energy, max_energy)
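A minimal usage sketch for prepare_data(); the directory, file names and element list below are assumptions:

names, x, energies, raw_x, e_min, e_max = prepare_data(
    'data/',               # assumed data directory
    'features.csv',        # assumed features file
    'energies.csv',        # assumed energies file
    normalize=True,
    expand=False,
    remove=True,
    elements=['F', 'Cl'])  # hypothetical elements whose molecules are dropped

# The returned energy range can be used, e.g., as colour limits when plotting.
print(names.shape, x.shape, energies.shape, raw_x.shape)
print('energy range:', e_min, 'to', e_max)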
Example #3

import cross_validation
import load_data
import pandas as pd

data_path = 'Data/białaczka.XLS'
features_path = 'Data/features.txt'

# load features
features = load_data.load_features(features_path)

# load data
(X, y) = load_data.load_data_from_files(data_path, features)

# params
k_best_features = [10, 15, 20]
neurons_in_hidden_layer = [32, 64, 256]
momentum = [0, 0.9]

# data frame
df = pd.DataFrame(columns=[
    'Best features', 'Neurons in hidden layer', 'Momentum', 'Average'
])
for k_best in k_best_features:
    for neurons in neurons_in_hidden_layer:
        for m in momentum:
            scores = cross_validation.run_crossvalid(X, y, k_best, neurons, m)
            average = sum(scores) / len(scores) if len(scores) != 0 else 0
            # record the average score for this parameter combination
            df.loc[len(df)] = [k_best, neurons, m, average]
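A short follow-up sketch of how the collected averages might be inspected; nothing below is part of the original script, only the column names defined above are reused:

# Sort by the average cross-validation score; the first row is the best configuration.
df_sorted = df.sort_values('Average', ascending=False)
print(df_sorted)
print('Best configuration:', df_sorted.iloc[0].to_dict())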
Example #4

import load_data
import select_features
import cross_validation
import model_benchmark
import statistical_analysis

no_of_crossvalid_runs = 2
no_of_folds = 5

# Load the list of features from a file; keeping the list in a text file seemed more useful than hardcoding it as an array in the Python code.
features = load_data.load_features()

# Scan all files (names hardcoded) in the data directory and load all samples into one big DataFrame.
(X_features, Y_diagnosis) = load_data.load_data_from_files(features)

# Create and print a feature ranking using all samples.
#feature_ranking = select_features.create_feature_ranking(X_features, Y_diagnosis)
#print(feature_ranking)

# Run and print the score of one cross-validation run with sample parameters.
#score = cross_validation.run_crossvalid(X_features, Y_diagnosis, 2, 3, 5, 'manhattan', 420)
#print(score)

# Run a function that tests different sets of parameters for the kNN cross-validation.
run_results = model_benchmark.run(X_features, Y_diagnosis,
                                  no_of_crossvalid_runs, no_of_folds)
print(run_results)

analysis_result = statistical_analysis.run(run_results, no_of_crossvalid_runs,
                                           no_of_folds)
print(analysis_result)
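For context, a minimal sketch of the kind of paired comparison such a statistical analysis commonly performs on per-fold cross-validation scores; this is an illustration only, not the statistical_analysis module itself, and the score arrays are hypothetical:

from scipy import stats
import numpy as np

# Hypothetical per-fold accuracies of two kNN configurations evaluated on the same folds.
scores_a = np.array([0.91, 0.88, 0.93, 0.90, 0.89])
scores_b = np.array([0.87, 0.86, 0.90, 0.88, 0.85])

# Paired t-test, appropriate because both configurations share the same folds.
t_stat, p_value = stats.ttest_rel(scores_a, scores_b)
print('t = %.3f, p = %.3f' % (t_stat, p_value))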