def main(filename, numberOfComponents):
    """Run a PCA on the data in ``filename`` and print the explained variance ratio.

    The file is loaded through ``utilities.readDataFile``, passed through
    ``utilities.getDataWithTimeIndex`` and stripped of missing values with
    ``dropna()``. The configuration is looked up with ``getConfig`` using
    the second-to-last ``/``-separated component of ``filename``.

    Args:
        filename: Path to the data file; must contain at least one ``/``.
        numberOfComponents: Forwarded unchanged to ``analysis.pca``.

    Raises:
        IndexError: If ``filename`` contains no ``/`` separator.
    """
    data = utilities.getDataWithTimeIndex(utilities.readDataFile(filename)).dropna()

    # The configuration key is the directory directly containing the file.
    pathParts = filename.split('/')
    configKey = pathParts[-2]

    # Only the relevant-column list and the label names are used here; the
    # remaining three config entries are unpacked purely to keep the shape.
    _columns, relevantColumns, labelNames, _columnUnits, _timestamps = getConfig(
        configKey)

    hasColumnFilter = relevantColumns is not None
    if hasColumnFilter:
        data = utilities.dropIrrelevantColumns(
            data, [relevantColumns, labelNames])

    prints.printEmptyLine()

    pcaResult = analysis.pca(
        data, numberOfComponents, relevantColumns, labelNames)
    prints.printExplainedVarianceRatio(pcaResult)
def main(filename):
    """Compute and print the correlation matrix of the data in ``filename``.

    The file is loaded through ``utilities.readDataFile``, passed through
    ``utilities.getDataWithTimeIndex`` and stripped of missing values with
    ``dropna()``. The configuration is looked up with ``getConfig`` using
    the second-to-last ``/``-separated component of ``filename``.

    Args:
        filename: Path to the data file; must contain at least one ``/``.

    Raises:
        IndexError: If ``filename`` contains no ``/`` separator.
    """
    data = utilities.getDataWithTimeIndex(utilities.readDataFile(filename)).dropna()

    # The configuration key is the directory directly containing the file.
    pathParts = filename.split('/')
    configKey = pathParts[-2]

    # Only the relevant-column list and the label names are used here; the
    # remaining three config entries are unpacked purely to keep the shape.
    _columns, relevantColumns, labelNames, _columnUnits, _timestamps = getConfig(
        configKey)

    hasColumnFilter = relevantColumns is not None
    if hasColumnFilter:
        data = utilities.dropIrrelevantColumns(
            data, [relevantColumns, labelNames])

    prints.printEmptyLine()

    correlations = analysis.correlationMatrix(data)
    prints.printCorrelationMatrix(correlations, data, labelNames)
def dropIrrelevantColumns(df, args):
    """Return ``df`` without the columns that are not listed as relevant.

    The column listing is printed before and after the removal via
    ``prints.printColumns``.

    Args:
        df: DataFrame whose columns are filtered.
        args: Two-element sequence ``(relevantColumns, columnDescriptions)``.
            ``relevantColumns`` is the collection of column labels to keep;
            ``columnDescriptions`` is forwarded to ``prints.printColumns``.

    Returns:
        The filtered DataFrame. When nothing has to be removed the input
        object itself is returned.
    """
    relevantColumns, columnDescriptions = args

    print("Columns before removal: ")
    prints.printColumns(df, columnDescriptions)

    # Collect every unwanted label exactly once and drop them in a single
    # call. Dropping label by label while iterating the original column
    # index raised KeyError on a duplicated label (the first drop already
    # removes all columns carrying it) and copied the frame once per column.
    irrelevantColumns = [
        column for column in df.columns.unique()
        if column not in relevantColumns
    ]
    if irrelevantColumns:
        df = df.drop(irrelevantColumns, axis=1)

    prints.printEmptyLine()
    print("Columns after removal: ")
    prints.printColumns(df, columnDescriptions)
    prints.printEmptyLine()
    return df
def dropColumn(filename, target_file, column):
    """Write a copy of the CSV file ``filename`` without ``column``.

    Args:
        filename: Path of the CSV file to read.
        target_file: Path the reduced CSV is written to (no index column).
        column: Label of the column to remove.

    Raises:
        KeyError: If ``column`` is not a column of the input file
            (raised by ``DataFrame.drop``).
    """
    df_iris = pd.read_csv(filename).drop(column, axis=1)
    print("Writing file {}".format(target_file))
    df_iris.to_csv(target_file, index=False)


pyName = "dropColumn.py"
arguments = [
    "- filename (string)",
    "- target filename (string)",
    "- name of column (string)",
]

# usage: python dropColumn.py file targetfile column
if __name__ == "__main__":
    start_time = time.time()
    prints.printEmptyLine()
    print("Running", pyName)
    print("Prints dataframe")
    prints.printEmptyLine()

    try:
        filename = sys.argv[1]
        target_file = sys.argv[2]
        column = sys.argv[3]
    except IndexError:
        print(pyName, "was called with inappropriate arguments")
        print("Please provide the following arguments:")
        for argument in arguments:
            print(argument)
        sys.exit()

    # The conversion must run only after the arguments have been parsed;
    # previously it sat at module level ahead of the guard and referenced
    # filename/column/target_file before they were assigned.
    dropColumn(filename, target_file, column)
def getDataByTimeframe(df, start, end):
    """Return the rows of ``df`` whose index labels lie between ``start`` and ``end``.

    The selection is a label-based ``df.loc[start:end]`` slice; the number of
    selected rows is printed before the result is returned.

    Args:
        df: Data to slice -- presumably a time-indexed pandas DataFrame;
            verify against callers.
        start: First index label of the slice.
        end: Last index label of the slice.

    Returns:
        The sliced data.
    """
    print("Finding data between", start, "and", end)

    selection = df.loc[start:end]
    rowCount = selection.shape[0]

    print("Found " + str(rowCount) + " rows")
    prints.printEmptyLine()
    return selection