def select_complex(self, out_table, clause, in_table="base"):
    in_df = self.table_store[in_table].df()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    out_df = None
    if clause is None:
        self.table_store[out_table] = DataTable(df=in_df)
        return
    once = True
    for i in clause:
        # conditions within a group are AND-ed by filtering successively
        tmp_df = in_df
        for j in clause[i]:
            equation = clause[i][j]
            val = float(equation["val"])
            param = equation["param"]
            if equation["comp"] == "<":
                tmp_df = tmp_df.loc[tmp_df[param] < val]
            elif equation["comp"] == "<=":
                tmp_df = tmp_df.loc[tmp_df[param] <= val]
            elif equation["comp"] == "=":
                tmp_df = tmp_df.loc[tmp_df[param] == val]
            elif equation["comp"] == "=>":
                tmp_df = tmp_df.loc[tmp_df[param] >= val]
            elif equation["comp"] == ">":
                tmp_df = tmp_df.loc[tmp_df[param] > val]
        # groups are OR-ed by merging their row sets
        if once:
            once = False
            out_df = tmp_df
        else:
            out_df = tmp_df.combine_first(out_df)
    if once:  # no clauses were within the dict
        out_df = in_df
    self.table_store[out_table] = DataTable(df=out_df, centroids=c)
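
# Usage sketch for select_complex (hypothetical: `engine` stands for an
# instance of this class; column names and thresholds are illustrative).
# Outer dict keys are OR-ed together (via combine_first); the conditions
# inside each group are AND-ed by successive filtering.
def _select_complex_example(engine):
    clause = {
        "group_a": {
            "cond_1": {"param": "acc_x", "comp": ">", "val": "0.5"},
            "cond_2": {"param": "acc_x", "comp": "<=", "val": "1.5"},
        },
        "group_b": {
            "cond_3": {"param": "_label", "comp": "=", "val": "2"},
        },
    }
    # rows with 0.5 < acc_x <= 1.5, OR rows whose _label equals 2
    engine.select_complex(out_table="filtered", clause=clause)
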
def aggregate(self, out_table, mode, limit=0, in_table="base"):
    df = self.table_store[in_table].df()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    data = dict()  # dictionary for results
    # aggregate all rows in one step if limit is 0
    if limit == 0:
        limit = len(df.index)
    # iterate over columns
    for attr, values in df.items():
        data[attr] = list()
        i = 0
        if attr == TIME_ATTR:
            # keep the first timestamp of each interval for MIN,
            # the last one for every other mode
            for value in values:
                if i % limit == 0 and mode == "MIN":
                    data[attr].append(value)
                i += 1
                if i % limit == 0 and mode != "MIN":
                    data[attr].append(value)
        else:
            # track data characteristics for the result
            interval_sum = 0
            interval_min = 0
            interval_max = 0
            # count how many non-missing values are in the interval
            valid_cnt = 0
            for value in values:
                # (re)initialize at the start of an interval
                if i % limit == 0 or np.isnan(interval_min):
                    interval_min = value
                    interval_max = value
                    interval_sum = 0
                    valid_cnt = 0
                else:
                    # update min and max for each valid value
                    if not np.isnan(value):
                        if value < interval_min:
                            interval_min = value
                        if value > interval_max:
                            interval_max = value
                if not np.isnan(value):
                    valid_cnt += 1
                    interval_sum = interval_sum + value
                i = i + 1
                # produce a new result row at the end of each interval
                if i % limit == 0:
                    insval = np.nan
                    if valid_cnt > 0:
                        if mode == "AVG":
                            insval = interval_sum / valid_cnt
                        elif mode == "MAX":
                            insval = interval_max
                        elif mode == "MIN":
                            insval = interval_min
                    data[attr].append(insval)
    self.table_store[out_table] = DataTable(df=pd.DataFrame(data), centroids=c)
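
# Usage sketch for aggregate (hypothetical `engine` instance; the table
# names are illustrative). With limit=60, each block of 60 consecutive
# rows collapses into one row; mode picks AVG, MIN or MAX per column,
# while the TIME_ATTR column keeps one timestamp per block.
def _aggregate_example(engine):
    # average each column over blocks of 60 rows
    engine.aggregate(out_table="block_avg", mode="AVG", limit=60)
    # limit=0 (the default) aggregates the whole table into a single row
    engine.aggregate(out_table="overall_max", mode="MAX")
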
def project(self, out_table, attr1, attr2, in_table="base"):
    df = self.table_store[in_table].df()
    # narrow data to the specified attributes
    df = df[[attr1, attr2]]
    # store results in a new table
    self.table_store[out_table] = DataTable(df=df)
def select(self, out_table, attr_name, a=None, b=None, in_table="base"):
    df = self.table_store[in_table].df()
    c = self.table_store[in_table].centroids
    # select the tuples within the boundaries [a, b] for the given
    # attribute; an omitted bound leaves that end unrestricted
    if a is not None:
        df = df.loc[df[attr_name] >= a]
    if b is not None:
        df = df.loc[df[attr_name] <= b]
    # store results in a new table
    self.table_store[out_table] = DataTable(df=df, centroids=c)
def cluster(self, out_table, k, params, in_table="base"):
    df = self.table_store[in_table].df()  # type: pd.DataFrame
    # check for null values and remove them
    # (retain only non-null values for the relevant params)
    for p in params:
        df = df.loc[df[p].notnull()]
    # fit k-means on the selected feature columns
    kmeans = cluster.KMeans(n_clusters=k)
    kmeans.fit(df[params])
    centroids = kmeans.cluster_centers_
    df["_label"] = kmeans.labels_
    self.table_store[out_table] = DataTable(df=df, centroids=centroids)
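
# Usage sketch for cluster (hypothetical `engine` instance; the column
# names are illustrative). Rows with nulls in the chosen params are
# dropped, the k-means labels land in a new "_label" column, and the
# fitted centroids travel with the output table.
def _cluster_example(engine):
    engine.cluster(out_table="clustered", k=3, params=["acc_x", "acc_y"])
    labeled = engine.table_store["clustered"].df()
    print(labeled["_label"].value_counts())
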
def normalize(self, out_table, params=None, in_table="base"):
    if params is None:
        params = self.get_grain_columns()
    df = self.table_store[in_table].df()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    other_params = [col for col in df.columns if col not in params]
    # temporarily set aside the unconcerned columns
    out_df = df[other_params]
    # normalize all other columns so that each row sums to 1
    df = df[params]
    df = df.div(df.sum(axis=1), axis=0)
    # put them back together
    out_df = out_df.join(df)
    self.table_store[out_table] = DataTable(df=out_df, centroids=c)
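
# Usage sketch for normalize (hypothetical `engine` instance; column
# names are illustrative). The selected params are rescaled so each row
# sums to 1; all other columns pass through unchanged.
def _normalize_example(engine):
    engine.normalize(out_table="proportions", params=["acc_x", "acc_y", "acc_z"])
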
def newcols(self, out_table, mode, newcols, in_table="base"):
    df = self.table_store[in_table].df().copy()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    # each new column aggregates its source columns row-wise
    if mode == "SUM":
        for new_colname, paramlist in newcols.items():
            df[new_colname] = df[paramlist].sum(axis=1)
    elif mode == "MEAN":
        for new_colname, paramlist in newcols.items():
            df[new_colname] = df[paramlist].mean(axis=1)
    elif mode == "MIN":
        for new_colname, paramlist in newcols.items():
            df[new_colname] = df[paramlist].min(axis=1)
    elif mode == "MAX":
        for new_colname, paramlist in newcols.items():
            df[new_colname] = df[paramlist].max(axis=1)
    self.table_store[out_table] = DataTable(df=df, centroids=c)
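
# Usage sketch for newcols (hypothetical `engine` instance; column names
# are illustrative). Each key of the newcols dict becomes a new column
# computed row-wise (SUM/MEAN/MIN/MAX) over the listed source columns.
def _newcols_example(engine):
    engine.newcols(
        out_table="with_totals",
        mode="SUM",
        newcols={"acc_total": ["acc_x", "acc_y", "acc_z"]},
    )
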
def aggregateTime(self, out_table, mode, minutes=60 * 24, in_table='base'):
    df = self.table_store[in_table].df()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    freq = '{}Min'.format(minutes)
    # group rows into fixed-length time intervals; requires a
    # DatetimeIndex (pd.TimeGrouper is deprecated, pd.Grouper replaces it)
    by = pd.Grouper(freq=freq)
    grouped = df.groupby(by)
    if mode == 'COUNT':
        out = grouped.count()
    elif mode == 'AVG':
        out = grouped.mean()
    elif mode == 'MIN':
        out = grouped.min()
    else:  # default to MAX
        out = grouped.max()
    self.table_store[out_table] = DataTable(df=out, centroids=c)
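
# Usage sketch for aggregateTime (hypothetical `engine` instance; the
# table names are illustrative). The frame must carry a DatetimeIndex;
# minutes=60*24 (the default) yields one aggregated row per day.
def _aggregate_time_example(engine):
    engine.aggregateTime(out_table="daily_avg", mode="AVG", minutes=60 * 24)
    engine.aggregateTime(out_table="hourly_count", mode="COUNT", minutes=60)
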
def groupby2(self, out_table, attr, mode, in_table="base", bydate=False):
    df = self.table_store[in_table].df()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    if bydate:
        # strip the time component so grouping happens per calendar day
        by = df[attr].dt.normalize()
    else:
        by = df[attr]
    grouped = df.groupby(by)
    if mode == 'COUNT':
        out = grouped.count()
    elif mode == 'SUM':
        out = grouped.sum()
    else:  # default to MAX
        out = grouped.max()
    # store results in a new table
    self.table_store[out_table] = DataTable(df=out, centroids=c)
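
# Usage sketch for groupby2 (hypothetical `engine` instance; the column
# name is illustrative). With bydate=True the datetime column is
# normalized to midnight first, so rows are grouped per calendar day.
def _groupby2_example(engine):
    engine.groupby2(out_table="rows_per_day", attr="timestamp",
                    mode="COUNT", bydate=True)
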
def link(self, out_table, in_table="base"):
    df = self.table_store[in_table].df()
    c = self.table_store[in_table].centroids
    # store a 'copy' (same underlying data) under the new name
    self.table_store[out_table] = DataTable(df=df, centroids=c)
def read_data(self, path):
    self.table_store["base"] = DataTable(path)
def select_ids(self, out_table, ids, in_table="base"):
    df = self.table_store[in_table].df()  # type: pd.DataFrame
    c = self.table_store[in_table].centroids
    # keep only the rows whose index is in ids
    df = df.loc[ids]
    self.table_store[out_table] = DataTable(df=df, centroids=c)
def store_df(self, df, name):
    self.table_store[name] = DataTable(df=df)
def main(argv):
    # default options, overridable from the command line
    netType = 'ResNet'
    input_shape1 = 224
    input_shape2 = 224
    gaus = 0
    numberOfEpochs = 30
    hiddenUnits = 75
    patients_to_use = 'ALL'
    weights = ''
    folder = "D:\\deep learning dataset\\MS Fall Study"
    try:
        opts, args = getopt.getopt(argv, "?f:t:1:2:g:e:h:p:w:")
    except getopt.GetoptError:
        Help()
        return
    for opt, arg in opts:
        if opt == '-?':
            Help()
            return
        elif opt == '-f':
            folder = arg
        elif opt == '-w':
            weights = arg
        elif opt == '-t':
            netType = arg
        elif opt == '-1':
            input_shape1 = int(arg)
        elif opt == '-2':
            input_shape2 = int(arg)
        elif opt == '-g':
            gaus = float(arg)
        elif opt == '-e':
            numberOfEpochs = int(arg)
        elif opt == '-h':
            hiddenUnits = int(arg)
        elif opt == '-p':
            # -p accepts a patient count or a string such as 'ALL'
            try:
                patients_to_use = int(arg)
            except ValueError:
                patients_to_use = arg

    if 'Plt' == netType:
        vis = Visualize(folder, patients_to_use, True)
        vis.run()
    elif 'Img' == netType:
        vis = Visualize(folder, patients_to_use, False)
        vis.run()
    elif 'Table' == netType:
        table = DataTable(folder)
        table.run()
    else:
        netTypeVal = GetType(netType)
        if NETTYPE_INVALID == netTypeVal:
            print('unknown type:', netType)
            return
        print('##################################################################################')
        print('# {0:s} shape ({1:d}, {2:d}) epochs {3:d} gaus {4:f} hidden units {5:d} #'
              .format(netType, input_shape1, input_shape2, numberOfEpochs,
                      gaus, hiddenUnits))
        print('##################################################################################')
        rgb = True
        twoD = False
        batchSize = 32
        input_shape = (input_shape1, input_shape2)
        activities_to_load = [
            "30s Chair Stand Test", "Tandem Balance Assessment",
            "Standing Balance Assessment", "Standing Balance Eyes Closed",
            "ADL: Normal Walking", "ADL: Normal Standing",
            "ADL: Normal Sitting", "ADL: Slouch sitting",
            "ADL: Lying on back", "ADL: Lying on left side",
            "ADL: Lying on right side"
        ]
        preLoader = MatrixPreLoader(dataset_directory=folder,
                                    patients_to_use=patients_to_use,
                                    activity_types=activities_to_load,
                                    print_loading_progress=True)
        num_features = preLoader.get_number_of_patients()
        # build the requested model; RNN variants switch to 2D grayscale input
        if NETTYPE_VGGBN == netTypeVal:
            vgg = VGG16Imp()
            model = vgg.VGG16WithBN(input_shape=(input_shape1, input_shape2, 3),
                                    classes=num_features)
        elif NETTYPE_VGG16 == netTypeVal:
            model = VGG16(weights=None, classes=num_features,
                          input_shape=(input_shape1, input_shape2, 3))
        elif NETTYPE_RESNET == netTypeVal:
            resnet = ResNetImp()
            model = resnet.ResNet((input_shape1, input_shape2, 3), num_features)
        elif NETTYPE_RESNETPT == netTypeVal or NETTYPE_RESNETPD == netTypeVal:
            resnet = ResNetImp()
            model = resnet.ResNetP(weights, (input_shape1, input_shape2, 3),
                                   num_features)
        elif NETTYPE_SIMPLE == netTypeVal:
            rgb = False
            twoD = True
            rnn = RNNImp(hiddenUnits)
            model = rnn.SimpleRNN(input_shape, num_features)
        elif NETTYPE_GRU == netTypeVal:
            rgb = False
            twoD = True
            rnn = RNNImp(hiddenUnits)
            model = rnn.GRU(input_shape, num_features)
        elif NETTYPE_LSTM == netTypeVal:
            rgb = False
            twoD = True
            rnn = RNNImp(hiddenUnits)
            model = rnn.LSTM(input_shape, num_features)
        else:
            return
        print("create_generators")
        training_gen, validation_gen = create_generators(
            preLoader, input_shape, rgb, twoD, gaus, batchSize)
        if NETTYPE_RESNETPD == netTypeVal:
            print("predict_with_generator")
            predict_with_generator(model, validation_gen, preLoader)
        else:
            print("train_model_with_generator")
            train_model_with_generator(model, training_gen, validation_gen,
                                       numberOfEpochs, netType)
            training_dist = training_gen.GetDistribution()
            validation_dist = validation_gen.GetDistribution()
            print('training distribution', training_dist)
            print('validation distribution', validation_dist)
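
# Invocation sketch (hypothetical: the entry-point guard and script name
# are illustrative and assume `import sys` at module top). The getopt
# string "?f:t:1:2:g:e:h:p:w:" maps to: -f data folder, -t network type,
# -1/-2 input shape, -g gaus value (float, passed to the generators),
# -e epochs, -h hidden units, -p patients to use (int or 'ALL'),
# -w weights file.
if __name__ == "__main__":
    main(sys.argv[1:])
    # e.g. from a shell (script name illustrative):
    #   python run_experiment.py -t LSTM -e 50 -h 100 -p 5 -f ./ms_fall_study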