Example #1
0
def get_statistics_independently(arff_file):
    """Split an ARFF feature matrix into one ARFF file per statistic suffix.

    Loads ``arff_file`` via ``am.arff_to_nparray`` and, for each statistic
    name (max, min, mean, ...), selects the columns whose attribute name ends
    with that statistic and writes them (plus the class labels) to a new ARFF
    file named ``<original>_<stat>`` in the same folder.

    Args:
        arff_file: Path to the source ARFF file. If it has no directory
            component, the current working directory is used for output.
    """
    matrix, labels, relation, attributes = am.arff_to_nparray(arff_file)
    classes = list(set(labels))
    labels = labels.reshape(-1, 1)
    folder, name = os.path.split(arff_file)
    if folder == "":
        folder = os.getcwd()
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]

    for stat in stats_names:
        subname = name.replace(".arff", "_%s" % stat)
        # Fix: the original used attributes.index(attribute), which is O(n)
        # per lookup and always returns the FIRST occurrence — duplicate
        # attribute names would all map to the same column. enumerate()
        # yields each attribute's true position.
        indices = [
            i for i, attribute in enumerate(attributes)
            if attribute.endswith(stat)
        ]
        submatrix = np.concatenate((matrix[:, indices], labels), axis=-1)
        subheader = np.concatenate(
            (np.array(attributes)[indices], np.array(["Class"])),
            axis=-1).reshape(1, -1)
        am.create_arff(
            np.concatenate((subheader, submatrix), axis=0).tolist(), classes,
            folder, subname, subname)
Example #2
0
def get_statistics(features_folder, output_path=None, statistics=None):
    """Compute column-wise statistics over every per-class feature CSV.

    Walks ``features_folder`` (one sub-directory per class, each holding
    headerless ``.csv`` feature files), reduces each file's columns with the
    requested statistics, and writes the resulting matrix via
    ``am.create_arff`` plus a ``<output_path>.txt`` listing of analyzed files.

    Args:
        features_folder: Root folder whose sub-directories are class names.
        output_path: Base path (no extension) for output files. Defaults to
            ``features_folder/<basename of features_folder>``.
        statistics: List of statistic names to compute per column.
            Defaults to ``["mean"]``.

    Raises:
        ValueError: If ``statistics`` contains an unsupported name. (The
            original if/elif chain had no else branch, so an unknown name
            silently reused the previously computed values — or raised a
            confusing NameError on the first iteration.)
    """
    if statistics is None:
        # Avoid the mutable-default-argument pitfall; default behavior
        # is unchanged for existing callers.
        statistics = ["mean"]

    # Column-wise (axis=0) reducers, NaN-aware where numpy provides it.
    stat_funcs = {
        "max": lambda v: np.nanmax(v, axis=0),
        "min": lambda v: np.nanmin(v, axis=0),
        "mean": lambda v: np.nanmean(v, axis=0),
        "median": lambda v: np.nanmedian(v, axis=0),
        "std": lambda v: np.nanstd(v, axis=0),
        "var": lambda v: np.nanvar(v, axis=0),
        "kurt": lambda v: scipy.stats.kurtosis(v, axis=0),
        "skew": lambda v: scipy.stats.skew(v, axis=0),
        "percentile25": lambda v: np.nanpercentile(v, 25, axis=0),
        "percentile50": lambda v: np.nanpercentile(v, 50, axis=0),
        "percentile75": lambda v: np.nanpercentile(v, 75, axis=0),
    }
    unknown = [s for s in statistics if s not in stat_funcs]
    if unknown:
        raise ValueError("Unsupported statistic(s): %s" % ", ".join(unknown))

    if output_path is None:
        output_path = os.path.join(features_folder,
                                   os.path.split(features_folder)[1])
    features = os.path.split(features_folder)[1]
    analyzed_files = []
    matrix = []
    # Rebuilt for every file; identical across files as long as all CSVs
    # share the same column count — assumed but not checked here.
    header = []

    classes = sorted([
        f for f in os.listdir(features_folder)
        if os.path.isdir(os.path.join(features_folder, f))
        and not f.startswith('.')
    ],
                     key=lambda f: f.lower())

    for class_name in classes:
        files = sorted([
            f for f in os.listdir(os.path.join(features_folder, class_name))
            if os.path.isfile(os.path.join(features_folder, class_name, f))
            and not f.startswith('.') and f[-4:].lower() == ".csv"
        ],
                       key=lambda f: f.lower())
        analyzed_files += ["%s,%s" % (file, class_name) for file in files]
        for feat_file in files:
            # header=None: the CSVs are treated as headerless, so column
            # labels are integer positions used to build feature names.
            df = pandas.read_csv(os.path.join(features_folder, class_name,
                                              feat_file),
                                 header=None)
            feature_names = [
                "%s_%s" % (features, num) for num in df.columns.values
            ]
            vals = df.values
            header = []
            data = []
            for statistic in statistics:
                values = stat_funcs[statistic](vals)
                header += [
                    "%s_%s" % (name, statistic) for name in feature_names
                ]
                data.append(values)
            instance = np.concatenate(tuple(data), axis=-1).tolist()
            instance.append(class_name)
            matrix.append(instance)
            print("%s analyzed." % feat_file)
    header.append("Class")
    matrix = [header] + matrix
    am.create_arff(
        matrix, classes,
        os.path.split(output_path)[0],
        os.path.split(output_path)[1] + "_%s" % "_".join(statistics),
        os.path.split(output_path)[1] + "_statistics")
    print("Statistics from %s obtained." % os.path.split(output_path)[1])
    with open(output_path + ".txt", "w+") as files:
        files.write("\n".join(analyzed_files))
Example #3
0
def get_statistics_per_category(databaseFolder, processedDataFolder=None):
    """Write one ARFF of per-column statistics for each visual feature category.

    For every category in ``categoryDictionary`` (gaze, eye/facial landmarks,
    head pose, AU intensity/presence — presumably OpenFace CSV columns; TODO
    confirm), scans each class sub-folder of ``databaseFolder`` for CSV
    files, computes 11 statistics per matching column, and saves the result
    via ``am.create_arff`` plus a ``<category>.txt`` list of analyzed files.

    Args:
        databaseFolder: Root folder with one sub-directory per class label.
        processedDataFolder: Output folder; defaults to "datasets/visual".
    """

    if processedDataFolder == None:
        processedDataFolder = "datasets/visual"

    # Class labels = visible sub-directories, sorted case-insensitively.
    classes = sorted([
        f for f in os.listdir(databaseFolder)
        if os.path.isdir(os.path.join(databaseFolder, f))
        and not f.startswith('.')
    ],
                     key=lambda f: f.lower())
    # Order here fixes the order of the per-column statistic suffixes below.
    stats_names = [
        'max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew',
        'percentile25', 'percentile50', 'percentile75'
    ]

    # Category -> column-name prefixes/suffixes that select its columns.
    # Prefixes (e.g. "gaze_") match via startswith; suffixes (e.g. "_r",
    # presumably AU intensity columns like "AU01_r" — confirm) via endswith.
    categoryDictionary = {
        "gaze": ["gaze_"],
        "eye_landmarks": ["eye_lmk_"],
        "head": ["pose_"],
        "facial_landmarks": ["x_", "y_"],
        "au_intensity": ["_r"],
        "au_presence": ["_c"]
    }
    for category in categoryDictionary.keys():
        # startFlag: True only until the first file is processed; it gates
        # building the header row exactly once per category.
        startFlag = True
        analyzedFiles = []
        for className in classes:
            # Visible .csv files of this class, sorted case-insensitively.
            files = sorted([
                f for f in os.listdir(os.path.join(databaseFolder, className))
                if os.path.isfile(os.path.join(databaseFolder, className, f))
                and not f.startswith('.') and f[-4:].lower() == ".csv"
            ],
                           key=lambda f: f.lower())
            analyzedFiles += ["%s,%s" % (file, className) for file in files]
            for feat_file in files:
                mm_feats = []   # one row of statistics for this file
                mm_names = []   # header names (only filled for first file)
                df = pandas.read_csv(os.path.join(databaseFolder, className,
                                                  feat_file),
                                     header='infer')
                feature_names = df.columns.values
                for feat in feature_names:
                    reference = categoryDictionary.get(category)
                    for string in reference:
                        if feat.strip().lower().startswith(string) \
                                or feat.strip().lower().endswith(string):
                            # Feature vector
                            vals = df[feat].values
                            # Run statistics
                            maximum = np.nanmax(vals)
                            minimum = np.nanmin(vals)
                            mean = np.nanmean(vals)
                            median = np.nanmedian(vals)
                            std = np.nanstd(vals)
                            var = np.nanvar(vals)
                            kurt = scipy.stats.kurtosis(vals)
                            skew = scipy.stats.skew(vals)
                            percentile25 = np.nanpercentile(vals, 25)
                            percentile50 = np.nanpercentile(vals, 50)
                            percentile75 = np.nanpercentile(vals, 75)
                            names = [
                                feat.strip() + "_" + stat
                                for stat in stats_names
                            ]
                            # Order must match stats_names above.
                            feats = [
                                maximum, minimum, mean, median, std, var, kurt,
                                skew, percentile25, percentile50, percentile75
                            ]
                            if startFlag:
                                for n in names:
                                    mm_names.append(n)
                            for f in feats:
                                mm_feats.append(f)
                            # A column can match at most one pattern string.
                            break
                # NOTE(review): assumes every file yields the same matching
                # columns in the same order as the first one — not checked.
                if startFlag:
                    matrix = [mm_names + ["Class"]]
                    startFlag = False
                matrix.append(mm_feats + [className])
        am.create_arff(matrix, classes, processedDataFolder, category,
                       category)
        print("Analysis of %s acquired." % (category))
        with open(os.path.join(processedDataFolder, "%s.txt" % (category)),
                  "w+") as files:
            files.write("\n".join(analyzedFiles))
Example #4
0
def get_statistics(databaseFolder,
                   processedDataFolder=None,
                   outputFileName=None,
                   relationName=None):
    """Summarize every feature column of every per-class CSV into one ARFF.

    Each sub-directory of ``databaseFolder`` is a class label; each of its
    CSV files contributes one row made of 11 statistics per feature column
    (columns 0-4 are skipped — presumably OpenFace bookkeeping columns such
    as frame/timestamp; confirm). Results are written via ``am.create_arff``
    plus a ``<outputFileName>.txt`` listing of analyzed files.

    Args:
        databaseFolder: Root folder with one sub-directory per class label.
        processedDataFolder: Output folder; defaults to "datasets/visual".
        outputFileName: Output base name; defaults to "all".
        relationName: ARFF relation name; defaults to "all_visual".
    """

    if processedDataFolder is None:
        processedDataFolder = "datasets/visual"
    if outputFileName is None:
        outputFileName = "all"
    if relationName is None:
        relationName = "all_visual"

    # Class labels = visible sub-directories, sorted case-insensitively.
    classes = sorted(
        (d for d in os.listdir(databaseFolder)
         if os.path.isdir(os.path.join(databaseFolder, d))
         and not d.startswith('.')),
        key=lambda d: d.lower())

    # Statistic name -> reducer, in the fixed output order.
    stat_table = [
        ('max', np.nanmax),
        ('min', np.nanmin),
        ('mean', np.nanmean),
        ('median', np.nanmedian),
        ('std', np.nanstd),
        ('var', np.nanvar),
        ('kurt', scipy.stats.kurtosis),
        ('skew', scipy.stats.skew),
        ('percentile25', lambda v: np.nanpercentile(v, 25)),
        ('percentile50', lambda v: np.nanpercentile(v, 50)),
        ('percentile75', lambda v: np.nanpercentile(v, 75)),
    ]

    header_pending = True  # build the header row exactly once
    analyzedFiles = []
    for className in classes:
        class_dir = os.path.join(databaseFolder, className)
        csv_files = sorted(
            (f for f in os.listdir(class_dir)
             if os.path.isfile(os.path.join(class_dir, f))
             and not f.startswith('.') and f[-4:].lower() == ".csv"),
            key=lambda f: f.lower())
        analyzedFiles.extend(
            "%s,%s" % (f, className) for f in csv_files)
        for csv_name in csv_files:
            frame = pandas.read_csv(os.path.join(class_dir, csv_name),
                                    header='infer')
            row = []
            column_names = []
            # Skip the first five columns; summarize the rest.
            for column in frame.columns.values[5:]:
                series = frame[column].values
                for stat_name, reducer in stat_table:
                    if header_pending:
                        column_names.append(column.strip() + "_" + stat_name)
                    row.append(reducer(series))
            if header_pending:
                matrix = [column_names + ["Class"]]
                header_pending = False
            matrix.append(row + [className])
    am.create_arff(matrix, classes, processedDataFolder, outputFileName,
                   relationName)
    print("Analysis of all OpenFace features acquired.")
    with open(os.path.join(processedDataFolder, outputFileName + ".txt"),
              "w+") as files:
        files.write("\n".join(analyzedFiles))
Example #5
0
def get_statistics_per_category(databaseFolder, processedDataFolder=None):
    """Write one ARFF of per-column statistics for each acoustic feature category.

    For every category in ``categoryDictionary`` (voice, glottal flow, mcep,
    hmpdm/hmpdd — presumably COVAREP-style acoustic CSV columns; TODO
    confirm), scans each class sub-folder of ``databaseFolder`` for CSV
    files, computes 11 statistics per matching column (sanitizing inf/NaN),
    and saves the result via ``am.create_arff`` plus a ``<category>.txt``
    list of analyzed files.

    Args:
        databaseFolder: Root folder with one sub-directory per class label.
        processedDataFolder: Output folder; defaults to "datasets/acousticic".
            NOTE(review): "acousticic" looks like a typo for "acoustic" —
            confirm the intended folder name before changing it.
    """

    if processedDataFolder == None:
        processedDataFolder = "datasets/acousticic"

    # Class labels = visible sub-directories, sorted case-insensitively.
    classes = sorted([f for f in os.listdir(databaseFolder)
                      if os.path.isdir(os.path.join(databaseFolder, f)) and not f.startswith('.')],
                     key=lambda f: f.lower())
    # Order here fixes the order of the per-column statistic suffixes below.
    stats_names = ['max', 'min', 'mean', 'median', 'std', 'var', 'kurt', 'skew', 'percentile25', 'percentile50',
                   'percentile75']

    # Category -> column-name prefixes/suffixes that select its columns.
    # Matched case-insensitively via startswith OR endswith below.
    categoryDictionary = {"voice": ["f0", "vuv"],
                          "glottal_flow": ["naq", "qoq", "h1h2", "psp", "mdq", "peakslope", "rd", "creak"],
                          "mcep": ["mcep_"],
                          "hmpdm": ["hmpdm_"],
                          "hmpdd": ["hmpdd_"],
                          }
    for category in categoryDictionary.keys():
        # startFlag: True only until the first file is processed; it gates
        # building the header row exactly once per category.
        startFlag = True
        analyzedFiles = []
        for className in classes:
            # Visible .csv files of this class, sorted case-insensitively.
            files = sorted([f for f in os.listdir(os.path.join(databaseFolder, className))
                          if os.path.isfile(os.path.join(databaseFolder, className, f)) and not f.startswith('.')
                            and f[-4:].lower() == ".csv"], key=lambda f: f.lower())
            analyzedFiles += ["%s,%s" % (file, className) for file in files]
            for feat_file in files:
                mm_feats = []   # one row of statistics for this file
                mm_names = []   # header names (only filled for first file)
                df = pandas.read_csv(os.path.join(databaseFolder, className, feat_file), header='infer')
                feature_names = df.columns.values
                for feat in feature_names:
                    reference = categoryDictionary.get(category)
                    for string in reference:
                        if feat.strip().lower().startswith(string) \
                                or feat.strip().lower().endswith(string):
                            # Feature vector
                            vals = df[feat].values
                            # Run statistics
                            maximum = np.nanmax(vals)
                            minimum = np.nanmin(vals)
                            mean = np.nanmean(vals)
                            median = np.nanmedian(vals)
                            std = np.nanstd(vals)
                            var = np.nanvar(vals)
                            kurt = scipy.stats.kurtosis(vals)
                            skew = scipy.stats.skew(vals)
                            percentile25 = np.nanpercentile(vals, 25)
                            percentile50 = np.nanpercentile(vals, 50)
                            percentile75 = np.nanpercentile(vals, 75)
                            names = [feat.strip() + "_" + stat for stat in stats_names]
                            # Order must match stats_names above.
                            feats = [maximum, minimum, mean, median, std, var, kurt, skew, percentile25, percentile50, percentile75]
                            if startFlag:
                                for n in names:
                                    mm_names.append(n)
                            # Sanitize non-finite statistics: +/-inf -> +/-1
                            # (np.sign), NaN -> 0, so the ARFF stays numeric.
                            for f in feats:
                                if np.isinf(f):
                                    mm_feats.append(np.sign(f))
                                elif np.isnan(f):
                                    mm_feats.append(0)
                                else:
                                    mm_feats.append(f)
                            # A column can match at most one pattern string.
                            break
                # NOTE(review): assumes every file yields the same matching
                # columns in the same order as the first one — not checked.
                if startFlag:
                    matrix = [mm_names + ["Class"]]
                    startFlag = False
                matrix.append(mm_feats + [className])
        am.create_arff(matrix,classes,processedDataFolder,category,category)
        print("Analysis of %s acquired." % (category))
        with open(os.path.join(processedDataFolder, "%s.txt"%(category)), "w+") as files:
            files.write("\n".join(analyzedFiles))