def import_dataset_arff(f, explain_indices: List[int], random_explain_dataset: bool) -> Tuple[Dataset, Dataset, List[str]]:
    """Load an ARFF dataset and split it into a training and an explain subset.

    Args:
        f: ARFF source, passed unchanged to ``load_arff``.
        explain_indices: Row indices to move into the explain subset. Replaced
            by a random selection when ``random_explain_dataset`` is true.
        random_explain_dataset: When true, draw the explain rows at random
            (fixed seed) instead of using ``explain_indices``.

    Returns:
        ``(train_dataset, explain_dataset, explain_ids)``; ``explain_ids`` are
        the explain indices converted to strings.

    Raises:
        ValueError: If an explain index is not (or no longer) among the
            training indices, as raised by ``list.remove``.
    """
    full_dataset = load_arff(f)
    row_count = len(full_dataset)
    train_indices = list(range(row_count))

    if random_explain_dataset:
        # Re-seeding the global RNG makes the random split reproducible.
        random.seed(1)
        MAX_SAMPLE_COUNT = 100
        # Small datasets give up 20% of their rows, larger ones a fixed count.
        is_small_dataset = row_count < (2 * MAX_SAMPLE_COUNT)
        samples = int(0.2 * row_count) if is_small_dataset else MAX_SAMPLE_COUNT
        explain_indices = list(random.sample(train_indices, samples))

    # Rows selected for explanation must not remain in the training split.
    for explain_index in explain_indices:
        train_indices.remove(explain_index)

    train_split = Dataset.from_indices(train_indices, full_dataset)
    explain_split = Dataset.from_indices(explain_indices, full_dataset)
    explain_ids = [str(index) for index in explain_indices]
    return train_split, explain_split, explain_ids
def test_equals_nequals(self):
    """Check that '=' and '!=' cuts on one variable give the expected boolean columns."""
    test_df = pd.DataFrame({'testvar1': [1, 0, 1, 0, 0, 0, 1, 1, 1, 0]})
    equals_cut = {
        'name': 'cut 1',
        'cut_var': 'testvar1',
        'relation': '=',
        'cut_val': 1,
        'group': 'var1cut',
        'is_symmetric': True,
    }
    not_equals_cut = {
        'name': 'cut 2',
        'cut_var': 'testvar1',
        'relation': '!=',
        'cut_val': 1,
        'group': 'var1cut',
        'is_symmetric': False,
    }
    cut_label = config.cut_label

    # The cut columns are read back from test_df below, so the call is
    # expected to add them to the frame it is given.
    Dataset._create_cut_columns(test_df, [equals_cut, not_equals_cut])

    expected_columns = [
        pd.Series(
            data=[True, False, True, False, False, False, True, True, True, False],
            name='cut 1' + cut_label),
        pd.Series(
            data=[False, True, False, True, True, True, False, False, False, True],
            name='cut 2' + cut_label),
    ]
    for expected in expected_columns:
        actual = test_df[expected.name]
        assert actual.equals(expected), f"Expected {expected}, got {actual}"
def convert():
    """Quantize the detector's Keras model to a TFLite flatbuffer and write it to disk.

    The model is converted with default optimizations, restricted to the
    TFLITE_BUILTINS_INT8 op set with uint8 input/output, and calibrated on a
    single image taken from the dataset pipeline. The result is written to
    ``../models/tpu/ohmnilabs_floornet_224_quant_postprocess.tflite`` relative
    to this file.
    """
    # Load model
    image_shape = (224, 224)
    detector = Detector(image_shape, 'models')
    model = detector.model

    # Data pipeline
    batch_size = 64
    ds = Dataset(image_shape, batch_size)
    pipeline, _ = ds.pipeline()

    def representative_dataset_gen():
        # Calibration input: only the first image of the first batch is used.
        for raw_imgs, _ in pipeline.take(1):
            img = np.array([raw_imgs[0]])
            yield [img]  # Shape (1, height, width, channel)

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset_gen
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.uint8
    converter.inference_output_type = tf.uint8
    tflite_quant_model = converter.convert()

    MODEL = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '../models/tpu/ohmnilabs_floornet_224_quant_postprocess.tflite')
    # Context manager guarantees the file is flushed and closed, even if the
    # write fails part-way.
    with open(MODEL, 'wb') as model_file:
        model_file.write(tflite_quant_model)
def _compute_prediction_difference_subset(training_dataset: Dataset, encoded_instance: pd.Series, rule_body_indices, clf, instance_class_index):
    """Return the prediction difference for one subset of rule attributes.

    The result is ``p - sum(differences)``, where ``p`` is the classifier's
    probability for ``instance_class_index`` on the instance and each
    difference comes from ``_compute_perturbed_difference`` for one distinct
    combination of the selected attributes' values in the training data.

    Args:
        training_dataset: Dataset exposing ``attributes()`` and ``X()``.
        encoded_instance: Encoded instance; its last element is dropped before
            prediction (presumably the class label - confirm with the caller).
        rule_body_indices: Attribute positions; each is shifted by -1 before
            indexing ``training_dataset.attributes()``.
        clf: Classifier exposing ``predict_proba``.
        instance_class_index: Column of ``predict_proba`` to read.
    """
    # Resolve each rule index to the first field of its attribute entry.
    rule_attributes = []
    for rule_body_index in rule_body_indices:
        attribute_entry = list(training_dataset.attributes())[rule_body_index - 1]
        rule_attributes.append(attribute_entry[0])

    # Take only the considered attributes from the dataset
    filtered_rows = training_dataset.X()[rule_attributes].values.tolist()

    # Count how many times a set of attribute values appears in the dataset
    attribute_sets_occurrences = dict(Counter(map(tuple, filtered_rows)))

    # One perturbed difference per distinct (values, count) pair
    prediction_difference = sum(
        _compute_perturbed_difference(item, clf, encoded_instance,
                                      instance_class_index, rule_attributes,
                                      training_dataset)
        for item in attribute_sets_occurrences.items())

    # p(y=c|x) i.e. Probability that instance x belongs to class c
    instance_features = encoded_instance[:-1].to_numpy().reshape(1, -1)
    p = clf.predict_proba(instance_features)[0][instance_class_index]
    return p - prediction_difference
def setUp(self):
    """Create the Dataset and GitRepository fixtures for the 'cassandra20200615' sample."""
    repositories = [{
        "name": "cassandra20200615",
        "url": "",
        "CommitTarget": "",
        "filterFile": "",
        "codeIssueJira": "",
        "projectJira": ""
    }]
    option = Option({
        "name": "cassandra20200615",
        "mode": "train",
        "repositories": repositories,
        "parameters": {},  # not needed when running inference (per the original note)
    })
    self.dataset = Dataset(option.getRepositorieImproved())
    self.repository = repositories[0]

    # The repository checkout lives under the test data folder.
    repository_path = os.path.join(UtilPath.Test(), "testDataset",
                                   self.repository["name"], "repository")
    print(repository_path)
    self.gr = GitRepository(repository_path)
def train():
    """Train the detector on the dataset pipelines and plot the loss curves."""
    # Config params
    image_shape = (224, 224)
    batch_size = 64
    epochs = 30

    # Dataset & model
    detector = Detector(image_shape)
    ds = Dataset(image_shape, batch_size)
    training_pipeline, validation_pipeline = ds.pipeline()

    # Start training; partial batches are not counted as a step.
    model_history = detector.train(
        training_pipeline,
        epochs,
        ds.num_training // batch_size,
        validation_pipeline,
    )

    # Visualize loss
    history = model_history.history
    training_loss = history['loss']
    validation_loss = history['val_loss']
    epoch_axis = range(epochs)

    plt.figure()
    plt.plot(epoch_axis, training_loss, 'r', label='Training loss')
    plt.plot(epoch_axis, validation_loss, 'bo', label='Validation loss')
    plt.title('Training Loss and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss Value')
    plt.ylim([0, 1])
    plt.legend()
    plt.show()
def main():
    """Run random partitioning on a TwitterSample2 subgraph and save the result."""
    network_dataset = Dataset('twitters2')
    sample_nodes = read_file_to_dict(os.path.join(DATASET_PATH, 'TwitterSample2.txt'))

    # Keep the first half of the listed nodes.
    # NOTE(review): the original comment said "10% sampling", but the slice
    # takes 50% - confirm which one is intended.
    kept_nodes = sample_nodes[:len(sample_nodes) // 2]
    network_dataset.graph = network_dataset.graph.subgraph(kept_nodes)

    server_list = [Server(server_index) for server_index in range(512)]
    vp_number = 0
    node_list = list(network_dataset.graph.nodes)
    random.shuffle(node_list)

    graph = network_dataset.graph
    print('Dataset information: TwitterSample2\nNodes Number:', graph.order(),
          '\nEdge Number:', graph.size())
    print('Using Random Partitioning Method...\nServer Number:', len(server_list),
          '\nVirtual Primary Copy Number:', vp_number,
          '\nWrite Frequency of Nodes: 1')

    # Only placement and the two checks are timed; cost computation and
    # saving happen after the timer stops.
    start = time.time()
    partitioner = RandomP(server_list, network_dataset, node_list)
    partitioner.add_new_primary_node(server_list, vp_number)
    partitioner.check_server_load()
    partitioner.check_locality()
    end = time.time()
    print('Random Partitioning Time:', end - start, 'seconds')

    partitioner.compute_inter_sever_cost()
    partitioner.save_all(RANDOM_GRAPH_PATH)
def test_encode(filename, seq_length, text):
    """Check that ``Dataset.encode`` returns one one-hot vector per element of ``text``."""
    dataset = Dataset([filename], seq_length)
    encoded = dataset.encode(text)

    # One label per input element.
    assert len(encoded) == len(text)

    for one_hot in encoded:
        # Exactly one active position, vector as wide as the vocabulary.
        assert sum(one_hot) == 1
        assert len(one_hot) == dataset.vocab_size
def make_ct_datasets(configs, paths):
    """Build the training and validation loaders for the original/filtered image pairs.

    Image paths are globbed from ``Original/*`` and ``Filtered/*`` under
    ``paths['data']['path']``, sorted, and split 90/10 by position.

    Args:
        configs: Mapping; reads ``data_params`` (``batch_size``,
            ``num_workers``, ``augmentation_params.crop_size``).
        paths: Mapping; reads ``paths['data']['path']``.

    Returns:
        ``(train_loader, val_loader)``.
    """
    TRAIN_SIZE = 0.9
    data_root = paths['data']['path']
    o_img_paths = np.array(sorted(glob(os.path.join(data_root, 'Original/*'))))
    f_img_paths = np.array(sorted(glob(os.path.join(data_root, 'Filtered/*'))))

    # Each listing is split on its own length.
    o_split = int(TRAIN_SIZE * len(o_img_paths))
    f_split = int(TRAIN_SIZE * len(f_img_paths))
    img_paths_train = {
        'original': o_img_paths[:o_split],
        'filtered': f_img_paths[:f_split],
    }
    img_paths_val = {
        'original': o_img_paths[o_split:],
        'filtered': f_img_paths[f_split:],
    }

    data_params = configs['data_params']
    crop_size = data_params['augmentation_params']['crop_size']
    transforms_train = Compose([RandomCrop(crop_size), ToFloat(), ToTensor()])
    # Validation uses a fixed 1344 crop and batch size 1.
    transforms_val = Compose([RandomCrop(1344), ToFloat(), ToTensor()])

    train_loader = DataLoader(
        Dataset(img_paths_train, transforms_train),
        batch_size=data_params['batch_size'],
        num_workers=data_params['num_workers'],
        shuffle=True)
    val_loader = DataLoader(
        Dataset(img_paths_val, transforms_val),
        batch_size=1,
        num_workers=data_params['num_workers'],
        shuffle=False)
    return train_loader, val_loader
def __init__(
    self,
    config,
    name: str,
    device=torch.device('cuda'),
    model_path: str = None,
):
    """Set up the model, optimizer, summary writer, datasets, loaders and loss.

    Args:
        config: Configuration object; reads ``lr``, ``work_dir`` and
            ``batch_size`` and is also handed to ``Network`` and ``Dataset``.
        name: Run name; used as the sub-directory of ``config.work_dir`` for
            the summary writer.
        device: Device the model is moved to.
        model_path: Optional path to a saved state dict to restore into the
            model.
    """
    self.name = name
    self.config = config
    self.device = device
    self.model = Network(config).to(self.device)
    if model_path is not None:
        # map_location remaps the stored tensors onto the requested device,
        # so a checkpoint written on a GPU can be restored on a host without
        # that GPU (e.g. CPU-only).
        chckpt = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(chckpt)
    self.optimizer = torch.optim.Adam(self.model.parameters(),
                                      lr=self.config.lr)
    self.writer = SummaryWriter(
        os.path.join(self.config.work_dir, self.name))
    self.training_dataset = Dataset(dataset_type='training', config=config)
    self.validation_dataset = Dataset(dataset_type='validation',
                                      config=config)
    # Training batches are shuffled and the trailing partial batch dropped;
    # validation keeps the loader defaults.
    self.training_dataloader = torch.utils.data.DataLoader(
        self.training_dataset,
        batch_size=self.config.batch_size,
        shuffle=True,
        drop_last=True,
    )
    self.validation_dataloader = torch.utils.data.DataLoader(
        self.validation_dataset,
        batch_size=self.config.batch_size,
    )
    self.criterion = torch.nn.CrossEntropyLoss()
def create_algo(server_count=4, node_count=10):
    """Build an ``OfflineAlgo`` over an edgeless graph.

    Args:
        server_count: Number of ``Server`` objects to create (ids 0..n-1).
        node_count: Number of isolated nodes (0..n-1) to put in the graph.

    Returns:
        The constructed ``OfflineAlgo``.
    """
    data = Dataset(dataset_str='facebook')
    # Replace the dataset's graph with a fresh, edge-free one.
    data.graph = nx.Graph()
    for node_id in range(node_count):
        data.graph.add_node(node_id)

    server_list = []
    for server_id in range(server_count):
        server_list.append(Server(serer_id=server_id))

    return OfflineAlgo(server_list=server_list, network_dataset=data)
def show_predictions():
    """Display the first image, its mask and the predicted mask for one batch."""
    image_shape = (224, 224)
    detector = Detector(image_shape)
    pipeline, _ = Dataset(image_shape).pipeline()

    for image, mask in pipeline.take(1):
        pred_mask = detector.predict(image)
        # Only the first sample of the batch is shown.
        first_image = image[0]
        first_mask = mask[0]
        predicted = __create_mask(pred_mask)
        __display([first_image, first_mask, predicted])
def main():
    """Train one model per (train percentage, fold) pair and write metric summaries.

    For every entry of ``config.train_percents`` and ``config.train_folds`` the
    dataset indexes are loaded and ``train_model`` is run. The last row of each
    fold's result is stacked into a per-percentage ``metrics.txt``; the mean
    over folds of each percentage goes into the experiment-level
    ``metrics.txt``. The raw per-fold results are saved as a ``.npy`` file.
    """
    args = Parser().get_parser().parse_args()
    print("=====Configurations=====\n", args)
    # Load Configuration and data
    config = Config(args)
    dataset = Dataset(config)
    start = time.time()
    # NOTE(review): keyed by fold_id only, so entries from an earlier
    # percentage are overwritten by later ones - confirm this is intended.
    outer_tracking = {}
    # TODO Load data once across all folds
    headers = ['O_EPOCH', 'I_EPOCH', 'TR_F1', 'VAL_LOSS', 'VAL_F1', 'k-MICRO-F1', 'k-MACRO-F1', 'MICRO-F1', 'MACRO-F1', 'MC_ACC', 'ML_ACC', 'BAE']
    # The placeholders all alias one list; harmless here because every slot is
    # reassigned by index below, never mutated in place.
    perc_results = [[]]*len(config.train_percents)
    for perc_id, train_percent in enumerate(config.train_percents):
        print('\n\n############################ Percentage: ', train_percent, '#####################################')
        # config.train_percent = train_percent
        fold_results = [[]]*len(config.train_folds)
        for fold_id, fold in enumerate(config.train_folds):
            print('\n------- Fold: ', fold)
            # config.train_fold = fold
            dataset.load_indexes(train_percent, fold)
            values = train_model(dataset)
            if config.prop_model_name == 'propagation_gated':
                # NOTE(review): `scores` is not assigned anywhere in this
                # function; unless it is a module-level name this branch
                # raises NameError - confirm where it should come from.
                np.save(path.join(config.paths['experiment'], config.dataset_name + '-' + str(fold) + '-' + str(config.max_depth) + '_gating_scores.npy'), scores)
            outer_tracking[fold_id] = values
            # Only the final row of the fold's results feeds the summary.
            fold_results[fold_id] = values[-1]
            if not config.save_model:
                # train_percent and fold are concatenated without str(), so
                # both are presumably strings here - TODO confirm.
                remove_directory(config.paths['perc_' + train_percent] + '_' + fold)
        fold_results = np.vstack(fold_results)
        file_name = os.path.join(config.paths['perc_' + train_percent], 'metrics.txt')
        np.savetxt(file_name, fold_results, header=str(headers), comments='', fmt='%1.5f')
        # Average over folds for this percentage.
        perc_results[perc_id] = np.mean(fold_results, axis=0)
        if not config.save_model:
            remove_directory(config.paths['perc_' + train_percent])
    results = np.vstack(perc_results)
    file_name = os.path.join(config.paths['experiment'], 'metrics.txt')
    np.savetxt(file_name, results, header=str(headers), comments='', fmt='%1.5f')
    # NOTE(review): if the result columns follow `headers`, index 8 is
    # 'MACRO-F1' and index 9 is 'MC_ACC', which does not match the printed
    # labels - verify the intended columns.
    print('Mico: ', results[0][8], '| Macro: ', results[0][9])
    np.save(path.join(config.paths['experiment'], config.dataset_name+str(config.max_depth)+'_batch_results.npy'), outer_tracking)
    # TODO code inference - Load model and run test
    print('Time taken:', time.time() - start)
def test_sample(filename, batch_size, seq_length):
    """Check that ``Dataset.sample`` returns ``batch_size`` one-hot sequences of ``seq_length``."""
    dataset = Dataset([filename], seq_length)
    batch = dataset.sample(batch_size)

    count = 0
    for count, seq in enumerate(batch.inputs, start=1):
        assert len(seq) == seq_length
        for position in range(seq_length):
            # One-hot encoded
            one_hot = seq[position]
            assert sum(one_hot) == 1
            assert len(one_hot) == dataset.vocab_size

    # `count` ends as the number of sequences seen (0 for an empty batch).
    assert count == batch_size
def test_batch(filename, batch_size, seq_length):
    """Check the shape and one-hot encoding of every batch from ``Dataset.batch``."""
    dataset = Dataset([filename], seq_length)
    for batch in dataset.batch(batch_size):
        inputs, targets = batch.inputs, batch.targets

        # The number of elements in the batch is `batch_size`
        assert len(inputs) == batch_size
        assert len(targets) == batch_size

        for row in range(batch_size):
            # Each element in the batch is a sequence
            assert len(inputs[row]) == seq_length
            assert len(targets[row]) == seq_length

            for step in range(seq_length):
                # One-hot encoded (only the inputs are checked)
                one_hot = inputs[row][step]
                assert sum(one_hot) == 1
                assert len(one_hot) == dataset.vocab_size
def test_derived_variable(self, tmp_root_datafile):
    """Check that derived variables are computed from their source columns."""
    derived_vars = {
        'dev_var1': {
            'var_args': ['testvar1', 'testvar2'],
            'tree': 'tree1',
            'func': lambda x, y: x + y
        },
        'dev_var2': {
            'var_args': ['testvar4'],
            'tree': 'tree2',
            'func': lambda x: 2 * x
        }
    }
    # Request the derived variables alongside the fixture's variables.
    vars_to_cut = self.test_vars_to_cut.union(derived_vars)

    event_index = np.arange(1000)
    expected_output = self.expected_output.copy()
    expected_output['testvar2'] = event_index * 1.1
    expected_output['testvar4'] = event_index * -1
    # Expected derived columns mirror the lambdas above.
    expected_output['dev_var1'] = (expected_output['testvar1']
                                   + expected_output['testvar2'])
    expected_output['dev_var2'] = 2 * expected_output['testvar4']

    output = Dataset._build_dataframe(
        tmp_root_datafile,
        TTree_name=self.default_TTree,
        cut_list_dicts=self.test_cut_dicts,
        vars_to_cut=vars_to_cut,
        calc_vars_dict=derived_vars)

    # test column names are the same
    assert set(output.columns) == set(expected_output.columns)

    # test contents are the same
    for column in output.columns:
        assert np.array_equal(output[column], expected_output[column])
def load_data():
    """Read the training, test and item-feature tables and wrap them in a ``Dataset``.

    All three files are tab-separated and read from ``DEFAULT_DATA_FOLDER``.
    The item features arrive in long form (``item_id``, ``feature``,
    ``value``) and are pivoted to one row per item, with missing features
    filled with 0. The ``rating`` column is split off from the training and
    test frames as float targets.

    Returns:
        ``Dataset(training_df, y_train, test_df, y_test, item_info_wide)``.
    """
    # Ids are read as strings, not inferred as numbers.
    id_dtypes = {"user_id": str, "item_id": str}

    # read
    training_df = pd.read_csv(os.path.join(DEFAULT_DATA_FOLDER, "training"),
                              sep="\t",
                              dtype=id_dtypes)
    test_df = pd.read_csv(os.path.join(DEFAULT_DATA_FOLDER, "test"),
                          sep="\t",
                          dtype=id_dtypes)
    item_info_long = pd.read_csv(os.path.join(DEFAULT_DATA_FOLDER,
                                              "item_features"),
                                 sep="\t",
                                 dtype={"item_id": str})
    item_info_wide = item_info_long.pivot(
        index="item_id", columns="feature",
        values="value").reset_index().fillna(0)

    # `np.float` was only an alias of the builtin `float` and was removed in
    # NumPy 1.24; the builtin yields the same float64 dtype on every version.
    y_train = training_df.rating.values.astype(float)
    training_df = training_df.drop(columns=["rating"])
    y_test = test_df.rating.values.astype(float)
    test_df = test_df.drop(columns=["rating"])
    return Dataset(training_df, y_train, test_df, y_test, item_info_wide)
def test_EncodedDataset_constructor(self):
    """Check the (types, values, attribute) triples that ``EncodedDataset`` yields for two entries."""
    entries = [
        Entry("entry1", [Example(([10, 20, 30], ), 10)],
              {"HEAD": True, "SORT": False}),
        Entry("entry2", [Example(([30, 20, 10], ), [10, 20, 30])],
              {"HEAD": False, "SORT": True}),
    ]
    dataset = ch.datasets.TupleDataset(entries)
    metadata = DatasetMetadata(1, {"HEAD", "SORT"}, 256, 5)
    cdataset = EncodedDataset(Dataset(dataset, metadata))

    first, second = list(cdataset)
    types0, values0, attribute0 = first
    types1, values1, attribute1 = second

    # Expected values are kept on the left of `==`, as in the original checks.
    checks = [
        ([[[0, 1], [1, 0]]], types0),
        ([[[266, 276, 286, 512, 512], [266, 512, 512, 512, 512]]], values0),
        (np.array([1, 0]), attribute0),
        ([[[0, 1], [0, 1]]], types1),
        ([[[286, 276, 266, 512, 512], [266, 276, 286, 512, 512]]], values1),
        (np.array([0, 1]), attribute1),
    ]
    for expected, actual in checks:
        self.assertTrue(np.all(expected == actual))
def __init__(self, raw_dataframe, data_config):
    """Store the raw frame and, when configured, a stratified train/test split.

    Args:
        raw_dataframe: Source data; must contain a ``label`` column when a
            split is requested (it is used for stratification).
        data_config: Configuration supporting ``keys()`` and attribute access;
            a non-None ``test_ratio`` enables the split.
    """
    self.raw = raw_dataframe
    wants_split = ("test_ratio" in data_config.keys()
                   and data_config.test_ratio is not None)
    self.train_test_split = wants_split

    if not wants_split:
        # NOTE(review): in this branch `self.train` is a plain DataFrame
        # rather than a `Dataset`, and `self.test` is never assigned - confirm
        # that callers check `self.train_test_split` first.
        self.train = pd.DataFrame(raw_dataframe)
        return

    # Fixed random_state keeps the split reproducible.
    train_part, test_part = train_test_split(
        self.raw,
        test_size=data_config.test_ratio,
        random_state=0,
        stratify=self.raw[["label"]])
    self.train = Dataset(train_part)
    self.test = Dataset(test_part)
def test_load(filename, start_seq):
    """Restore a saved ``RNNTextGenerator`` and print text generated from ``start_seq``."""
    seq_length = 25
    dataset = Dataset([filename], seq_length)

    # The generator is restored from the saved meta graph, not trained here.
    model = RNNTextGenerator(
        25,
        dataset.vocab_size,
        meta_graph='./model/RNNTextGenerator',
    )
    generated = model.generate(dataset, start_seq, 50)
    print(generated)
def test_alt_trees(self, tmp_root_datafile):
    """Check that a cut on a variable from a second tree pulls that variable into the frame."""
    # Extra cut whose variable lives in 'tree2' instead of the default tree.
    newcut = {
        'name': 'cut 3',
        'cut_var': 'testvar4',
        'relation': '<',
        'cut_val': -10,
        'group': 'var4cut',
        'is_symmetric': False,
        'tree': 'tree2'
    }
    # Work on a copy so the shared fixture list is not modified.
    list_of_dicts = self.test_cut_dicts.copy()
    list_of_dicts.append(newcut)

    event_index = np.arange(1000)
    expected_output = self.expected_output.copy()
    expected_output['testvar4'] = event_index * -1
    expected_output['eventNumber'] = np.arange(1000)

    output = Dataset._build_dataframe(
        tmp_root_datafile,
        TTree_name=self.default_TTree,
        cut_list_dicts=list_of_dicts,
        vars_to_cut=self.test_vars_to_cut)

    assert set(output.columns) == set(expected_output.columns)

    # test contents are the same
    for col in output.columns:
        assert np.array_equal(output[col], expected_output[col]), (
            f"Dataframe builder failed in column {col};\n"
            f"Expected: \n{expected_output[col]},\n"
            f"Got: \n{output[col]}")
def view_samples():
    """Plot up to five image/mask pairs from the first batch, side by side."""
    image_shape = (224, 224)
    pl, _ = Dataset(image_shape).pipeline()
    num_of_samples = 5

    for raw_imgs, mask_imgs in pl.take(1):
        samples = list(zip(raw_imgs[:num_of_samples], mask_imgs[:num_of_samples]))
        length = len(samples)

        # One row per sample: raw image on the left, mask on the right.
        plt.figure(figsize=(5, 5 * length))
        for row, (raw_img, mask_img) in enumerate(samples):
            left_cell = 2 * row + 1
            plt.subplot(length, 2, left_cell)
            plt.imshow(raw_img)
            plt.subplot(length, 2, left_cell + 1)
            # The mask is reshaped to (height, width) before display.
            plt.imshow(np.reshape(mask_img, image_shape))
        plt.show()
def test_missing_branch(self, tmp_root_datafile):
    """Check that requesting branches absent from the tree raises a descriptive ValueError."""
    missing_branches = {'missing1', 'missing2'}
    build_kwargs = dict(
        TTree_name=self.default_TTree,
        cut_list_dicts=self.test_cut_dicts,
        vars_to_cut=missing_branches,
    )

    with pytest.raises(ValueError) as e:
        _ = Dataset._build_dataframe(tmp_root_datafile, **build_kwargs)

    # The message must name the missing branches, the tree and the file.
    expected_pattern = r"Missing TBranch\(es\) .* in TTree 'tree1' of file .*"
    assert e.match(expected_pattern)
def test_relocate_process(self):
    """Smoke-test node relocation on a 10-node star graph with 8 servers."""
    data = Dataset(dataset_str='facebook')
    data.graph = nx.Graph()
    for node_id in range(10):
        data.graph.add_node(node_id)
    # Node 0 is connected to nodes 1-4; nodes 5-9 stay isolated.
    for neighbour in (1, 2, 3, 4):
        data.graph.add_edge(0, neighbour)

    server_list = [Server(serer_id=server_id) for server_id in range(8)]
    algo = OfflineAlgo(server_list=server_list, network_dataset=data)

    # Register every node as a primary copy before relocating.
    for node_id in list(data.graph.nodes):
        algo.add_new_primary_node(node_id=node_id,
                                  write_freq=Constant.WRITE_FREQ)

    # No assertions: the test passes if relocation completes without raising.
    algo.node_relocation_process()
def __init__(self, flags):
    """Create the TF session, load the dataset, build the model and initialize variables.

    Args:
        flags: Run configuration; ``flags.dataset`` selects the dataset and the
            whole object is passed on to ``Dataset`` and ``WGANTimeSeries``.
    """
    run_config = tf.compat.v1.ConfigProto(log_device_placement=False)
    # Allocate GPU memory on demand instead of reserving it all up front.
    run_config.gpu_options.allow_growth = True
    self.sess = tf.compat.v1.Session(config=run_config)

    self.flags = flags
    self.dataset = Dataset(self.sess, flags, self.flags.dataset)
    self.dataset.load_data()
    self.model = WGANTimeSeries(self.sess, self.flags, self.dataset)

    self._make_folders()
    self.iter_time = 0

    # Use the same tf.compat.v1 namespace as the session above: the bare
    # `tf.train.Saver` and `tf.global_variables_initializer` names do not
    # exist under TensorFlow 2.x.
    self.saver = tf.compat.v1.train.Saver()
    self.sess.run(tf.compat.v1.global_variables_initializer())

    tf_utils.show_all_variables()
def load_data(x_data, source_data, length_data, batch_size):
    """Build a shuffled ``DataLoader`` from three pickled files.

    Args:
        x_data: Path to the pickled ``X`` object; an empty string means
            "no data" and makes the function return ``None``.
        source_data: Path to the pickled ``source`` object.
        length_data: Path to the pickled ``length`` object.
        batch_size: Batch size for the loader.

    Returns:
        A ``DataLoader`` over ``Dataset(X, source, length)``, or ``None`` when
        ``x_data`` is empty.
    """
    if x_data == '':
        return None

    def _unpickle(file_path):
        # The context manager closes the handle even if unpickling fails.
        # NOTE: pickle can execute arbitrary code while loading; only use
        # this with files from a trusted source.
        with open(file_path, 'rb') as handle:
            return pickle.load(handle)

    X = _unpickle(x_data)
    source = _unpickle(source_data)
    length = _unpickle(length_data)
    data = Dataset(X, source, length)
    return DataLoader(data, batch_size=batch_size, shuffle=True)
def test_missing_tree(self, tmp_root_datafile):
    """Check that asking for a TTree that is not in the file raises a ValueError."""
    build_kwargs = dict(
        TTree_name='missing',
        cut_list_dicts=self.test_cut_dicts,
        vars_to_cut=self.test_vars_to_cut,
    )

    with pytest.raises(ValueError) as e:
        _ = Dataset._build_dataframe(tmp_root_datafile, **build_kwargs)

    # The message is compared exactly, including the file path.
    expected_message = f"TTree(s) 'missing' not found in file {tmp_root_datafile}"
    assert str(e.value) == expected_message
def process_video(video_path):
    """Detect and classify traffic signs frame by frame and show the annotated video.

    Each frame is resized, passed to the detector, and every detected box
    (padded by ``DELTA`` pixels) is cropped, preprocessed and classified by the
    CNN. Playback stops at the end of the stream or when 'q' is pressed.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
    """
    detector = initialize_detector()
    input_shape = (48, 48, 3)
    num_classes = 4
    cnn_weights_path = 'model/weights.h5'
    DELTA = 15
    dataset = Dataset()
    cnn = Model(input_shape, num_classes, cnn_weights_path)
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            grabbed, frame = cap.read()
            if not grabbed:
                # End of stream or read error: `frame` is not usable, so stop
                # before it reaches resize().
                break
            frame = resize(frame, (NEW_HEIGHT, NEW_WIDTH), mode='constant')
            print(frame.shape)
            predictions = hot_predict('dummy', detector, image=frame)
            for bounding_box in predictions:
                # Pad the box by DELTA. The top-left corner is clamped at 0
                # because a negative slice start would wrap around and give
                # an empty crop; slice ends past the edge are clipped anyway.
                x1 = max(int(bounding_box['x1']) - DELTA, 0)
                y1 = max(int(bounding_box['y1']) - DELTA, 0)
                x2 = int(bounding_box['x2']) + DELTA
                y2 = int(bounding_box['y2']) + DELTA
                traffic_sign = frame[y1:y2, x1:x2]
                processed_image = dataset._preprocess_image(traffic_sign,
                                                            centered=True)
                cnn_input = np.expand_dims(processed_image, axis=0)
                label = cnn.predict(cnn_input)
                draw_rectangle(frame, (x1, y1, x2, y2), label)
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the windows even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()
def test_duplicate_events_no_alt_tree(self, tmp_root_datafile_duplicate_events):
    """Check that a data file with duplicated events is rejected with the exact count."""
    datafile = tmp_root_datafile_duplicate_events

    with pytest.raises(Exception) as e:
        _ = Dataset._build_dataframe(
            datafile,
            TTree_name=self.default_TTree,
            cut_list_dicts=self.test_cut_dicts,
            vars_to_cut=self.test_vars_to_cut)

    # The fixture is expected to produce 1000 duplicated events.
    expected_message = f"Found 1000 duplicate events in datafile {datafile}."
    assert str(e.value) == expected_message
def test_merge_process(self):
    """Check connection counts after merge initialisation and after merging all nodes into one."""
    data = Dataset(dataset_str='facebook')
    data.graph = nx.Graph()
    for node_id in range(10):
        data.graph.add_node(node_id)
    # Star around node 0: edges to nodes 1-4, nodes 5-9 isolated.
    for neighbour in (1, 2, 3, 4):
        data.graph.add_edge(0, neighbour)

    server_list = [Server(serer_id=server_id) for server_id in range(8)]
    algo = OfflineAlgo(server_list=server_list, network_dataset=data)
    for node_id in list(data.graph.nodes):
        algo.add_new_primary_node(node_id=node_id,
                                  write_freq=Constant.WRITE_FREQ)
    algo.init_merge_process()

    # Right after initialisation nothing is internal yet; the external count
    # equals the node's degree in the star.
    expected_external = {0: 4, 1: 1, 2: 1, 3: 1, 4: 1}
    for m_node in algo.merged_node_list:
        self.assertEqual(m_node.internal_connection, 0)
        self.assertEqual(m_node.external_connection,
                         expected_external.get(m_node.id, 0))

    # Together the merged nodes cover each original node exactly once.
    all_node_ids = []
    for m_node in algo.merged_node_list:
        all_node_ids.extend(m_node.node_id_list)
    all_node_ids.sort()
    self.assertEqual(all_node_ids, list(range(10)))

    # Fold every other merged node into the first one.
    for index in range(1, len(algo.merged_node_list)):
        algo.merged_node_list[0]._add_node(algo.merged_node_list[index],
                                           algo=algo,
                                           remove_flag=False)

    # Sorted in place on the merged node's own list, as before.
    merged_ids = algo.merged_node_list[0].node_id_list
    merged_ids.sort()
    self.assertEqual(merged_ids, list(range(10)))
    # All four edges are now inside the single merged node.
    self.assertEqual(algo.merged_node_list[0].external_connection, 0)
    self.assertEqual(algo.merged_node_list[0].internal_connection, 4)
    self.assertEqual(algo.merged_node_list[0].node_count, 10)
def __init__(self, dsname, trainf='train_gdm', lyr=[nl.trans.TanSig(),nl.trans.TanSig()], lr=0.0001, epochs=100, update_freq=20, show=20, minmax=1.0, hid_lyr=10, ibias=1.0, norm=True):
    """Load the dataset, store the training parameters and build the network.

    Args:
        dsname: Dataset name passed to ``Dataset.load``.
        trainf: Name of the training function looked up on ``nl.train``.
        lyr: Transfer functions, one per layer.
        lr: Learning rate.
        epochs: Number of training epochs.
        update_freq: Stored as ``self.update_freq``.
        show: Stored as ``self.show``.
        minmax: Stored as ``self.minmax``.
        hid_lyr: Stored as ``self.hid_lyr``.
        ibias: Stored as ``self.ibias``.
        norm: Stored as ``self.norm`` before ``set_train_data`` runs.
    """
    self.ds = Dataset.load(dsname)
    self.norm = norm
    self.set_train_data()

    # training parameters
    self.trainf = getattr(nl.train, trainf)
    # Copy the list: the default `lyr` is a single list object shared by every
    # call, so storing it directly would let one instance's changes to
    # `self.lyr` leak into all later instances.
    self.lyr = list(lyr)
    self.lr = lr
    self.epochs = epochs
    self.update_freq = update_freq
    self.show = show
    self.minmax = minmax
    self.hid_lyr = hid_lyr
    self.ibias = ibias

    # neural network
    self.set_train_kwargs()
    self.net = self.setup_network()