def __init__(self, downsampling_step, sequence_length):
    loading_dataset_since = time()
    extension = 'xlsx'
    self.downsampling_step = downsampling_step
    self.sequence_length = sequence_length
    # find all files
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
    # concatenate all the data
    data_pd = pd.concat(
        [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
        ignore_index=True)
    data_numpy = data_pd.to_numpy().astype(float)
    zeros_removed = remove_zeros(data_numpy)
    downsampled_data = downsample(zeros_removed, downsampling_step)
    time_series_data = split_time_series(downsampled_data, sequence_length)
    sc = StandardScaler()
    scaled_data = sc.fit_transform(time_series_data)
    scaled_data_tensor = torch.from_numpy(scaled_data)
    # (N, sequence_length) -> (N, 1, sequence_length)
    scaled_data_tensor_reshaped = scaled_data_tensor.unsqueeze(0).transpose(1, 0)
    self.len = scaled_data_tensor_reshaped.shape[0]
    self.training_data_tensor = scaled_data_tensor_reshaped
    loading_dataset_end = time()
    hours, minutes, seconds = timer(loading_dataset_since, loading_dataset_end)
    print('The length of the dataset is {}'.format(
        len(self.training_data_tensor)))
    print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
        int(hours), int(minutes), seconds))
def cluster_edges(input_dir, output_file):
    file_names = os.listdir(input_dir)
    edges = []
    kept_file_names = []
    for file_name in file_names:
        edge = load_edge(file_name)
        # skip empty edges, and keep only the corresponding file names so the
        # cluster assignments written at the end line up with the rows clustered
        if edge.shape[0] == 0:
            continue
        axis_aligned = helpers.axis_align_pandas(edge.sort_values(by='x'))
        edges.append(axis_aligned)
        kept_file_names.append(file_name)
    downsampled_edges = []
    for edge in edges:
        downsampled_edges.append(
            helpers.downsample(helpers.axis_align_pandas(edge), 1000))
    total_frame_cols = get_cols_from_frame(downsampled_edges[0])
    total_frame = pd.DataFrame(columns=total_frame_cols)
    for edge in downsampled_edges:
        total_frame = total_frame.append(frame_to_row(edge), ignore_index=True)
    kmeans = KMeans(N_CLUSTERS, n_jobs=-1)
    kmeans.fit(total_frame)
    clusters = kmeans.predict(total_frame)
    pd.DataFrame({
        'filename': kept_file_names,
        'cluster': clusters
    }).to_csv(output_file, index=False)
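# A minimal usage sketch of the function above (the directory and CSV names here
# are hypothetical; load_edge, helpers, frame_to_row and N_CLUSTERS are assumed to
# be defined at module level as in the code above):
#
#     cluster_edges('edges/', 'edge_clusters.csv')
#
# This reads every edge file in 'edges/', axis-aligns and downsamples each one to
# 1000 points, clusters the resulting rows with k-means, and writes a
# filename -> cluster CSV.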
def find_flats(input_dir, output_file):
    # Takes a directory
    # Finds lines which are probably flat
    # (output_file added as a parameter: the original body wrote the cluster CSV
    # to an undefined name)
    file_names = os.listdir(input_dir)
    edges = []
    for file_name in file_names:
        edge = helpers.axis_align_pandas(
            load_edge(file_name).sort_values(by='x'))
        edges.append(edge)
    downsampled_edges = []
    for edge in edges:
        downsampled_edges.append(
            helpers.downsample(helpers.axis_align_pandas(edge), 1000))
    total_frame_cols = get_cols_from_frame(downsampled_edges[0])
    total_frame = pd.DataFrame(columns=total_frame_cols)
    for edge in downsampled_edges:
        total_frame = total_frame.append(frame_to_row(edge), ignore_index=True)
    kmeans = KMeans(N_CLUSTERS)
    kmeans.fit(total_frame)
    clusters = kmeans.predict(total_frame)
    pd.DataFrame({
        'filename': file_names,
        'cluster': clusters
    }).to_csv(output_file, index=False)
def send_spectrogram_update(self, spec, canvas_id=None):
    spec = downsample(spec)
    spec = astype(spec)
    nblocks, nfreqs = spec.shape
    print("spec_update:::shape:", spec.shape, "ch:", canvas_id)
    self.send_message('spectrogram',
                      {'action': 'update',
                       'nblocks': nblocks,
                       'nfreqs': nfreqs,
                       'canvasId': canvas_id},
                      spec.tostring())
def seq_query():
    # takes a seq hash and returns a downsampled region
    seq_hash = str(request.args["seq_hash"])
    method = request.args["method"]
    logging.debug(f"Getting data for seq ID {seq_hash}")
    if LOCAL:
        df = pd.read_parquet(f"data/{seq_hash}.{method}.parquet.sz")
    else:
        df = query_x_range(f"{seq_hash}.{method}.parquet.sz",
                           request.args.get("x_min"),
                           request.args.get("x_max"))
    logging.debug("Got the data")
    zone = df.loc[(float(request.args.get("x_max", df.x.max())) >= df.x)
                  & (float(request.args.get("x_min", df.x.min())) <= df.x)].values.tolist()
    return jsonify((seq_hash, downsample(zone)))
def transform_route():
    sequence = request.form["seq"]
    seq_name = request.form["seq_name"]
    method = request.form["method"]
    logging.debug("Hashing seq")
    seq_hash = str(xxhash.xxh64(sequence).intdigest())
    if LOCAL:
        exists = os.path.exists(f"data/{seq_hash}.{method}.parquet.sz")
        logging.debug(f"Found {seq_hash} locally")
    else:
        exists = exists_on_s3(f"{seq_hash}.{method}.parquet.sz")
        logging.debug(f"Found {seq_hash} on S3")
    if exists:
        if LOCAL:
            df = pd.read_parquet(f"data/{seq_hash}.{method}.parquet.sz")
        else:
            df = query_x_range(f"{seq_hash}.{method}.parquet.sz")
    else:
        logging.debug(
            f"No previous transformation for {seq_name} found. Transforming...")
        transformed = transform(sequence, method=method)
        logging.debug("Saving transformed data for " + seq_name)
        df = pd.DataFrame(dict(x=transformed[0], y=transformed[1]))
        df.to_parquet(f"data/{seq_hash}.{method}.parquet.sz")
        if not LOCAL:
            logging.debug(f"Uploading {seq_hash} to S3")
            upload(f"{seq_hash}.{method}.parquet.sz")
    logging.debug(f"Got the overview data for {seq_hash}")
    zone = df.values.tolist()
    return jsonify((seq_hash, downsample(zone)))
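# The two handlers above read from flask.request and return flask.jsonify
# responses, so they are presumably registered as Flask routes elsewhere in the
# app. An illustrative registration (URL paths here are assumptions, not taken
# from the source):
#
#     app.add_url_rule('/seq_query', 'seq_query', seq_query, methods=['GET'])
#     app.add_url_rule('/transform', 'transform_route', transform_route, methods=['POST'])
#
# seq_query reads its parameters from request.args (GET query string), while
# transform_route reads from request.form (POST form data).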
def __init__(self, downsampling_step, sequence_length, train=True, normalize=False):
    loading_dataset_since = time()
    extension = 'xlsx'
    self.downsampling_step = downsampling_step
    self.sequence_length = sequence_length

    # find all files and concatenate
    all_filenames = [i for i in glob.glob('*{}'.format(extension))]
    data = pd.concat(
        [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
        ignore_index=True)

    # extract torque and label
    torque = data.iloc[:, 0].to_numpy().astype(float)
    label = data.iloc[:, 1].to_numpy().astype(float)

    # remove zeros from torque and label
    label = np.delete(label, np.where(torque == 0))
    torque = remove_zeros(torque)

    # expand dimension and store the zero-removed data
    torque = np.expand_dims(torque, axis=1)
    label = np.expand_dims(label, axis=1)
    data = np.append(torque, label, axis=1)

    # find the normal and anomalous labeled sequences and divide the data into segments
    segmented_list = consecutive(
        (np.where(data[:, 1] == 0))[0]) + consecutive(
            (np.where(data[:, 1] == 1))[0])
    segmented_list.sort(key=lambda segment: segment[1])
    segmented_data = []
    for i in range(len(segmented_list)):
        segments = segmented_list[i]
        start_index = segments[0]
        end_index = segments[len(segments) - 1]
        seg_data = data[start_index:end_index + 1, :]
        segmented_data.append(seg_data)

    # downsample the data and make sequences
    sequenced_data = []
    for i in range(len(segmented_data)):
        label = segmented_data[i][0, 1]
        data = downsample(segmented_data[i][:, 0], self.downsampling_step)
        data = split_time_series(data, self.sequence_length)
        if label == 0.:
            label_column = [0] * len(data)
        else:
            label_column = [1] * len(data)
        sequenced_data.append(np.column_stack((data, label_column)))
    data = np.empty((0, self.sequence_length + 1))
    for i in range(len(sequenced_data)):
        if sequenced_data[i].shape[1] == self.sequence_length + 1:
            data = np.append(data, sequenced_data[i], axis=0)

    if normalize:
        # scale the data and return the tensor output
        sc = StandardScaler()
        training_data = data[0:int(0.7 * (len(data))), 0:self.sequence_length]
        testing_data = data[int(0.7 * (len(data))):, 0:self.sequence_length]
        training_label = data[0:int(0.7 * (len(data))), -1]
        testing_label = data[int(0.7 * (len(data))):, -1]
        sc_fit = sc.fit(training_data)
        if train:
            unlabeled_data = sc_fit.transform(training_data)
            data = np.column_stack((unlabeled_data, training_label))
        else:
            unlabeled_data = sc_fit.transform(testing_data)
            data = np.column_stack((unlabeled_data, testing_label))
    else:
        if train:
            data = data[0:int(0.7 * (len(data))), :]
        else:
            data = data[int(0.7 * (len(data))):, :]

    data = torch.from_numpy(data).unsqueeze(0).transpose(1, 0)
    self.len = data.shape[0]
    self.data = data
    loading_dataset_end = time()
    hours, minutes, seconds = timer(loading_dataset_since, loading_dataset_end)
    print('The length of the dataset is {}'.format(self.len))
    print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(
        int(hours), int(minutes), seconds))
def __init__(self, sequence_length, downsampling_step=10, train=True, normalize=True):
    self.sequence_length = sequence_length
    self.downsampling_step = downsampling_step
    self.train = train
    self.normalize = normalize

    # load the data (raw strings so the Windows-style backslashes are not treated
    # as escape sequences)
    if self.train:
        # fault-free training data
        load = pyreadr.read_r(r'.\TE_Data_full\TEP_FaultFree_Training.RData')
        load = load['fault_free_training']
        temp_data = np.asarray(load, dtype=np.float32)[:, 3:]
        #temp_label = np.asarray(load, dtype=np.int32)[:, 0]
        if self.normalize:
            sc = StandardScaler()
            temp_data = sc.fit_transform(temp_data)
        fault_free_training = temp_data
        del temp_data

        # make sequences
        sequenced_data_list = []
        for variables in fault_free_training.T:
            temp_data_d = downsample(variables, self.downsampling_step)
            temp_data = split_time_series(temp_data_d, self.sequence_length)
            temp_data = np.expand_dims(temp_data, axis=1)
            sequenced_data_list.append(temp_data)
        fault_free_training = torch.empty(
            (sequenced_data_list[0].shape[0], 0, self.sequence_length))
        for sequences in sequenced_data_list:
            fault_free_training = torch.cat(
                (fault_free_training, torch.from_numpy(sequences)), dim=1)
        fault_free_label = np.zeros(fault_free_training.shape[0], dtype=np.int32)
        del sequenced_data_list
        del temp_data

        # faulty training data
        load = pyreadr.read_r(r'.\TE_Data_full\TEP_Faulty_Training.RData')
        load = load['faulty_training']
        temp_data = np.asarray(load, dtype=np.float32)[:, 3:]
        temp_label = np.asarray(load, dtype=np.int32)[:, 0]
        if self.normalize:
            temp_data = sc.fit_transform(temp_data)
        faulty_training = temp_data
        del temp_data

        # make sequences
        sequenced_data_list = []
        for variables in faulty_training.T:
            temp_data_d = downsample(variables, self.downsampling_step)
            temp_data = split_time_series(temp_data_d, self.sequence_length)
            temp_data = np.expand_dims(temp_data, axis=1)
            sequenced_data_list.append(temp_data)
        faulty_training = torch.empty(
            (sequenced_data_list[0].shape[0], 0, self.sequence_length))
        for sequences in sequenced_data_list:
            faulty_training = torch.cat(
                (faulty_training, torch.from_numpy(sequences)), dim=1)
        faulty_label = np.ones(faulty_training.shape[0], dtype=np.int32)
        del sequenced_data_list
        del temp_data

        training_data = torch.cat((fault_free_training, faulty_training), dim=0)
        training_label = np.concatenate((fault_free_label, faulty_label), axis=0)
        self.data = training_data
        self.label = torch.from_numpy(training_label)
        self.len = len(self.data)
        print('The length of the dataset is {}'.format(self.len))
    else:
        # fault-free testing data
        load = pyreadr.read_r(r'.\TE_Data_full\TEP_FaultFree_Testing.RData')
        load = load['fault_free_testing']
        temp_data = np.asarray(load, dtype=np.float32)[:, 3:]
        #temp_label = np.asarray(load, dtype=np.int32)[:, 0]
        if self.normalize:
            sc = StandardScaler()
            temp_data = sc.fit_transform(temp_data)
        fault_free_testing = temp_data
        del temp_data

        # make sequences
        sequenced_data_list = []
        for variables in fault_free_testing.T:
            temp_data_d = downsample(variables, self.downsampling_step)
            temp_data = split_time_series(temp_data_d, self.sequence_length)
            temp_data = np.expand_dims(temp_data, axis=1)
            sequenced_data_list.append(temp_data)
        fault_free_testing = torch.empty(
            (sequenced_data_list[0].shape[0], 0, self.sequence_length))
        for sequences in sequenced_data_list:
            fault_free_testing = torch.cat(
                (fault_free_testing, torch.from_numpy(sequences)), dim=1)
        fault_free_label = np.zeros(fault_free_testing.shape[0], dtype=np.int32)
        del sequenced_data_list
        del temp_data

        # faulty testing data
        load = pyreadr.read_r(r'.\TE_Data_full\TEP_Faulty_Testing.RData')
        load = load['faulty_testing']
        temp_data = np.asarray(load, dtype=np.float32)[:, 3:]
        #temp_label = np.asarray(load, dtype=np.int32)[:, 0]
        if self.normalize:
            sc = StandardScaler()
            temp_data = sc.fit_transform(temp_data)
        faulty_testing = temp_data
        del temp_data

        # make sequences
        sequenced_data_list = []
        for variables in faulty_testing.T:
            temp_data_d = downsample(variables, self.downsampling_step)
            temp_data = split_time_series(temp_data_d, self.sequence_length)
            temp_data = np.expand_dims(temp_data, axis=1)
            sequenced_data_list.append(temp_data)
        faulty_testing = torch.empty(
            (sequenced_data_list[0].shape[0], 0, self.sequence_length))
        for sequences in sequenced_data_list:
            faulty_testing = torch.cat(
                (faulty_testing, torch.from_numpy(sequences)), dim=1)
        faulty_label = np.ones(faulty_testing.shape[0], dtype=np.int32)
        del sequenced_data_list
        del temp_data

        testing_data = torch.cat((fault_free_testing, faulty_testing), dim=0)
        testing_label = np.concatenate((fault_free_label, faulty_label), axis=0)
        self.data = testing_data
        self.label = torch.from_numpy(testing_label)
        self.len = len(self.data)
        print('The length of the dataset is {}'.format(self.len))
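# A minimal usage sketch for the dataset __init__ above, assuming it belongs to a
# torch.utils.data.Dataset subclass (called TEPDataset here purely for
# illustration) that also defines __len__ and __getitem__ returning
# (self.data[idx], self.label[idx]):
#
#     train_set = TEPDataset(sequence_length=100, downsampling_step=10, train=True)
#     train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
#     for sequences, labels in train_loader:
#         ...  # sequences: (batch, n_variables, sequence_length), labels: (batch,)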