def train():
    train_dataset = YoutubeTrendingDataset("./data/train.csv")
    train_dataloader = DataLoader(train_dataset, batch_size=1)
    preprocess_dict = {
        "publish_time": [TimeStampPreprocessing],
        "likes": [FrequencyPreprocessing],
        "category_id": [CategoricalPreprocessing]
    }
    preprocessor = Preprocessor(preprocess_dict)
    for batch_ix, data in enumerate(train_dataloader):
        train_x, train_y = preprocessor.run(data)
def test_smoke(self):
    """Smoke test - check that the preprocessor runs without exploding"""
    pp = Preprocessor(file_name=self.file_name,
                      input_dir=self.input_dir,
                      output_dir=self.output_dir)
    pp.process()

    out_dir_files = os.listdir(self.output_dir)
    for file_name in out_dir_files:
        name = utilities.path.get_name(self.file_name, extension=False)
        if name in file_name:
            file_path = os.path.join(self.output_dir, file_name)
            doc = Document.from_json(file_path)
            self.assertNotEqual(doc.pre_file_name, self.file_name)
            self.assertEqual(doc.file_name, 'test_preprocessed/lorem.json')
def by_infile(self, infile):
    # Clear any previous output directory; ignore if it does not exist
    try:
        shutil.rmtree(self.OUTPUT_DIR)
    except OSError:
        pass

    self.db_open()
    json_data = self.get_events_from_infile(infile)

    # build preprocessor
    ppr = Preprocessor()

    # Process raw data
    #X, Y, events_found = ppr.get_raw_data(DIMENSION, [RAW_FILE], bad)
    X, Y, events_found = ppr.get_from_json(self.DIMENSION, json_data)
    X, Y = ppr.remove_outliers(X, Y)
    X, Y = ppr.normalize(X, Y)
    trX, trY, teX, teY, vaX, vaY = ppr.partition_for_training(X, Y, 0.0, 1.0)
    ppr.store_training_partitions(trX, trY, teX, teY, vaX, vaY, self.INPUT_DIR)

    # build adapter
    adapter = MACAdapter(self.INPUT_DIR, self.DIMENSION, self.FOLDS)
    # build model
    convnet = ConvNet(self.DIMENSION)
    # build server
    server = ConvNetServer(adapter, self.OUTPUT_DIR,
                           batch_size=self.BATCH_SIZE,
                           verbose=True,
                           use=True)

    x, durs, _ = server.get_testing_batch()

    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        convnet.restore(sess, self.INITIAL_WEIGHTS)
        predictions = sess.run(convnet.predictor,
                               feed_dict={convnet.x: x, convnet.durs: durs})

    # Get event ids
    _, _, ids = adapter.get_ids()

    results = [{
        "eventID": int(ids[i]),
        "ml": {
            "aircraftProbability": round(np.around(predictions[i][0], decimals=4), 4),
            "model": self.MODEL
        }
    } for i in range(0, len(ids))]

    for result in results:
        self.insert_result_for_event(result)

    self.db_close()
# Set output path
if args.out_path:
    out_path = Path(args.out_path)
else:
    out_path = Path('data_' + str(patch_size) + '/test/masks_predicted_'
                    + time.strftime("%y%m%d-%H%M%S"))

if not out_path.exists():
    out_path.mkdir(parents=True)

# log all arguments including default ones
with open(Path(out_path, 'options.json'), 'w') as f:
    f.write(json.dumps(vars(args)))

# Preprocessing
preprocessor = Preprocessor()
if args.denoise:
    preprocessor.add_filter(
        filter.get_denoise_filter(args.denoise, args.denoise_parms))

# get loss function from function name
loss_function = get_loss_function(args.loss, args.loss_parms)

if 'bayes' in args.model or 'uncert' in args.model:
    mc_iterations = args.mc_iterations
else:
    # set Nr iterations to 1 for regular u-net
    mc_iterations = 1

# 2-Stage Optimization Process
if args.model == 'two_stage':
    # 1st Stage
def main():
    input_dir = ""
    output_dir = ""
    max_length_sentence = 100
    lc = False
    verbose = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hvls:i:o:", [
            "help", "verbose", "lowercase", "max_sen_length=", "input_dir=",
            "output_dir="
        ])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-s", "--max_sen_length"):
            max_length_sentence = int(a)
        elif o in ("-l", "--lowercase"):
            lc = True
        elif o in ("-i", "--input_dir"):
            input_dir = os.path.expanduser(a)
        elif o in ("-o", "--output_dir"):
            output_dir = os.path.expanduser(a)
        else:
            assert False, "unhandled option"

    if not os.path.exists(input_dir):
        print("input directory does not exist... exiting")
        sys.exit()
    if output_dir == "":
        output_dir = input_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        if verbose:
            print("directory {} created".format(output_dir))

    path_trainset = os.path.join(input_dir, TRAIN_NAME)
    path_testset = os.path.join(input_dir, TEST_NAME)
    if not os.path.exists(path_trainset):
        print("training set file is absent ({})".format(path_trainset))
        sys.exit()
    if not os.path.exists(path_testset):
        print("test set file is absent ({})".format(path_testset))
        sys.exit()

    path_save_voc_w = os.path.join(output_dir, VOCABULARY_NAME)
    path_save_voc_c = os.path.join(output_dir, CLASSES_NAME)

    prep_train = Preprocessor(path_trainset)
    prep_test = Preprocessor(path_testset, train=False)
    size_train = len(prep_train)
    size_test = len(prep_test)
    if verbose:
        print("Starting pre-processing on files of {} sentences".format(
            size_train + size_test))

    voc_w, voc_c, max_length = get_vocabulary([prep_train, prep_test], verbose)
    voc_w.add(PAD)
    write_vocabulary(voc_w, path_save_voc_w)
    write_vocabulary(voc_c, path_save_voc_c)
    embeddings_size = len(voc_w)
    del voc_w, voc_c, prep_train

    vocab_words = load_vocabulary(path_save_voc_w)  # Loading vocabularies as dictionary
    if verbose:
        print("\nvocabulary loaded back ... {} words "
              "(might be different from before due to utf-8 encoding issues...)".format(
                  len(vocab_words)))
    vocab_classes = load_vocabulary(path_save_voc_c)

    max_length = min(max_length, max_length_sentence)
    processing_words = get_word_preprocessing(vocab_words, max_length=max_length)
    processing_class = get_classes_preprocessing(vocab_classes)

    prep_to_int_train = Preprocessor(path_trainset,
                                     processing_words=processing_words,
                                     processing_class=processing_class)
    prep_to_int_test = Preprocessor(path_testset,
                                    train=False,
                                    processing_words=processing_words,
                                    processing_class=processing_class)

    train, _ = fill_matrix(size_train, max_length, prep_to_int_train, train=True)
    test, ids_test = fill_matrix(size_test, max_length, prep_to_int_test, train=False)

    np.save(os.path.join(output_dir, TRAIN_OUTPUT_NAME), train)
    np.save(os.path.join(output_dir, TEST_OUTPUT_NAME), test)
    with codecs.open(os.path.join(output_dir, IDS_OUTPUT_NAME), "w",
                     encoding='utf-8') as f:
        f.write("\n".join(ids_test))

    path_w2v = os.path.join(input_dir, EMBEDDINGS_INPUT_NAME)
    path_ngrams_w2v = os.path.join(input_dir, EMBEDDINGS_INPUT_NAME_NG)
    path_save_embeddings = os.path.join(output_dir, EMBEDDINGS_OUTPUT_NAME)
    min_n = 3
    max_n = 6
    create_embeddings(vocab_words, embeddings_size, path_w2v, path_ngrams_w2v,
                      path_save_embeddings, min_n, max_n)
from utilities import file_io
from sklearn.feature_extraction import FeatureHasher
from sklearn.neighbors import KNeighborsClassifier
import time

from preprocessing.preprocessor import Preprocessor
from LyricsProcessor import LyricsProcessor

if __name__ == "__main__":
    #chunks = file_io.read_lastfm_user_art_file("data/userid-timestamp-artid-artname-traid-traname.tsv")
    chunks = file_io.read_lastfm_user_art_file("data/test_shorter.tsv")

    # read songs
    vectorizer = FeatureHasher()
    pre = Preprocessor(chunks, vectorizer)
    songs = pre.read_songs(20)
    print(songs)

    # reset file reader
    #chunks = file_io.read_lastfm_user_art_file("data/tmp.tsv")
    #pre.reset_file_reader(chunks)

    # read user song mapping
    pre.read_user_songs(1000)

    # convert to user-song matrix
    X = pre.get_user_song_matrix()

    start_time = time.time()
    clf = KNeighborsClassifier(n_neighbors=1)
    clf.fit(X, list(range(X.shape[0])))
    print(clf.predict(pre.user_song_dict["user_000001"]))
                type=int,
                default=1,
                help="# of nearest neighbors for classification")
ap.add_argument("-j",
                "--jobs",
                type=int,
                default=-1,
                help="# of jobs for k-NN distance (-1 uses all available cores)")
args = vars(ap.parse_args())

print("[INFO] loading images...")
imagePaths = list(paths.list_images(args["dataset"]))

sp = Preprocessor(32, 32)
sdl = DatasetLoader(preprocessors=[sp])
(data, labels) = sdl.load(imagePaths, verbose=1000)
data = data.reshape((data.shape[0], 3072))

print("[INFO] features matrix: {:.1f}MB".format(data.nbytes / (1024 * 1000.0)))

le = LabelEncoder()
labels = le.fit_transform(labels)

(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.25,
                                                  random_state=42)

# k-NN
print("[INFO] evaluating k-NN classifier...")
def main(argv):
    """Main entrypoint for the anonymization tool"""
    # Default parameters
    configuration_file = ''
    input_file = ''
    use_cache = True
    weight = 0.5
    strategy = "gdf"
    result_dir = None

    # Read and set tool parameters
    try:
        opts, _ = getopt.getopt(argv, "c:i:r:w:v",
                                ["config=", "input=", "weight=", "result_dir=", "verbose"])
    except getopt.GetoptError:
        logger.error('experiment_runner.py -c <config_file> -i <input_file> -w <relational_weight>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-c", "--config"):
            configuration_file = arg
        if opt in ("-i", "--input"):
            input_file = arg
            base = os.path.basename(input_file)
            if not result_dir:
                result_dir = os.path.splitext(base)[0]
        if opt in ("-w", "--weight"):
            weight = float(arg)
            strategy = "mondrian"
        if opt in ("-r", "--result_dir"):
            result_dir = arg
        if opt in ("-v", "--verbose"):
            logging.getLogger().setLevel(logging.DEBUG)

    result_path = Path("experiment_results") / result_dir
    result_path.mkdir(parents=True, exist_ok=True)

    # Let's get started
    logger.info("Anonymizing input file %s", input_file)

    # Initialize and read configuration
    configuration_reader = ConfigurationReader()
    config = configuration_reader.read(configuration_file)

    # Read data using data types defined in the configuration
    data_reader = DataReader(config)
    df = data_reader.read(input_file)

    # Initialize the sensitive terms recognizer
    sensitive_terms_recognizer = SensitiveTermsRecognizer(config, use_cache)

    # Initialize the preprocessor (preprocessor is stateful, so pass df at the beginning)
    pp = Preprocessor(sensitive_terms_recognizer, config, df)

    # Run through preprocessing of the dataframe: data cleansing, analysis of textual
    # attributes, resolving of redundant information, and compression
    pp.clean_textual_attributes()
    pp.analyze_textual_attributes()
    pp.find_redundant_information()
    pp.compress()

    # Get sensitive terms dictionary and preprocessed dataframe
    terms = pp.get_sensitive_terms()
    df = pp.get_df()

    # Initialize the anonymization kernel by providing the sensitive terms dictionary,
    # the configuration, the sensitive terms recognizer, and the preprocessor
    kernel = AnonymizationKernel(terms, config, sensitive_terms_recognizer, pp)
    unanonymized = df

    # Determine k values for experiment
    k_values = [2, 3, 4, 5, 10, 20, 50]
    biases = config.get_biases()

    # Set strategy names
    if strategy == "mondrian":
        strategy_name = "mondrian-{}".format(weight)
    elif strategy == "gdf":
        strategy_name = strategy

    # Parameters for calculating metrics
    quasi_identifiers = config.get_quasi_identifiers()
    textual_attribute_mapping = pp.get_textual_attribute_mapping()

    # Prepare dataframes and json to store experiment results
    total_information_loss = pd.DataFrame(index=k_values, columns=[strategy_name])
    total_information_loss.index.name = 'k'
    relational_information_loss = pd.DataFrame(index=k_values, columns=[strategy_name])
    relational_information_loss.index.name = 'k'
    textual_information_loss = pd.DataFrame(index=k_values, columns=[strategy_name])
    textual_information_loss.index.name = 'k'

    detailed_loss_level_0 = [k for k in textual_attribute_mapping]
    detailed_loss_level_1 = set()
    for k in textual_attribute_mapping:
        for e in textual_attribute_mapping[k]:
            detailed_loss_level_1.add(e.replace("{}_".format(k), ''))
    detailed_loss_level_1 = ["total"] + list(detailed_loss_level_1)
    detailed_textual_information_loss = pd.DataFrame(
        index=k_values,
        columns=pd.MultiIndex.from_product([detailed_loss_level_0, detailed_loss_level_1]))
    detailed_textual_information_loss.index.name = 'k'

    partition_sizes = {}
    partition_sizes[strategy_name] = {}
    partition_splits = {}
    partition_splits[strategy_name] = {}

    # Let's start the experiments
    for k in k_values:
        logger.info("-------------------------------------------------------------------------------")
        logger.info("Anonymizing dataset with k=%d and strategy %s", k, strategy_name)

        # Anonymize dataset for a specific k
        anonymized_df, partitions, partition_split_statistics = kernel.anonymize_quasi_identifiers(
            df, k, strategy, biases, weight)

        # Calculate the total, relational, and textual information loss based on the
        # original and anonymized data frame
        total_il, relational_il, textual_il = calculate_normalized_certainty_penalty(
            unanonymized, anonymized_df, quasi_identifiers, textual_attribute_mapping)

        # Calculate the mean and std for partition size as well as split statistics
        mean_partition_size = calculate_mean_partition_size(partitions)
        std_partition_size = calculate_std_partition_size(partitions)
        if partition_split_statistics:
            number_of_relational_splits, number_of_textual_splits = get_partition_split_share(
                partition_split_statistics, textual_attribute_mapping)

        # Notify about the results
        logger.info("Information loss for relational attributes is %4.4f", relational_il)
        if textual_il:
            logger.info("Information loss for textual attribute is %4.4f", textual_il["total"])
        logger.info("Total information loss is %4.4f", total_il)
        logger.info("Ended up with %d partitions with a mean size of %.2f and a std of %.2f",
                    len(partitions), mean_partition_size, std_partition_size)
        if partition_split_statistics:
            logger.info("Split %d times on a relational attribute", number_of_relational_splits)
            logger.info("Split %d times on a textual attribute", number_of_textual_splits)

        # Store experiment results
        total_information_loss.at[k, strategy_name] = total_il
        relational_information_loss.at[k, strategy_name] = relational_il
        if textual_il:
            textual_information_loss.at[k, strategy_name] = textual_il["total"]
            for key in textual_il:
                if isinstance(textual_il[key], dict):
                    for subkey in textual_il[key]:
                        if subkey == "total":
                            detailed_textual_information_loss.at[k, (key, "total")] = textual_il[key]["total"]
                        else:
                            entity_type = subkey.replace("{}_".format(key), '')
                            detailed_textual_information_loss.at[k, (key, entity_type)] = textual_il[key][subkey]

        partition_sizes[strategy_name][k] = get_partition_lengths(partitions)
        if partition_split_statistics:
            partition_splits[strategy_name][k] = {
                "relational": number_of_relational_splits,
                "textual": number_of_textual_splits
            }

    # Define file info
    if strategy == "mondrian":
        file_info = str(weight).replace(".", "_")
    elif strategy == "gdf":
        file_info = strategy

    # Save the experiment results
    with open(result_path / 'partition_distribution_{}.json'.format(file_info), 'w') as f:
        json.dump(partition_sizes, f, ensure_ascii=False)
    if partition_split_statistics:
        with open(result_path / 'partition_splits_{}.json'.format(file_info), 'w') as f:
            json.dump(partition_splits, f, ensure_ascii=False)

    total_information_loss.to_csv(result_path / "total_information_loss_{}.csv".format(file_info))
    relational_information_loss.to_csv(result_path / "relational_information_loss_{}.csv".format(file_info))
    if textual_il:
        textual_information_loss.to_csv(result_path / "textual_information_loss_{}.csv".format(file_info))
        detailed_textual_information_loss.to_csv(result_path / "detailed_textual_information_loss_{}.csv".format(file_info))
def __predict_output__():
    plt.interactive(False)
    cfg = Configuration()
    GPU = True
    if not GPU:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # Input Path
    root_dir = os.path.dirname(os.path.abspath(__file__))
    image_path = cfg.image_path
    json_path = os.path.join(root_dir, cfg.input_filename)
    testingset = os.path.join(root_dir, 'testingset')
    Preprocessor.__generate_kijiji_set__(root_dir, image_path, json_path,
                                         testingset, 'model')

    # ------------------generator to compile training data of kijiji dataset----------------------------------------
    image_path = os.path.join(root_dir, 'testingset')
    data_path = glob(image_path + "/*")

    # Image Segmentation Parameters
    model_path = os.path.expanduser(cfg.model_path)
    assert model_path.endswith('.h5'), 'Keras model must be a .h5 file.'
    anchors_path = os.path.expanduser(cfg.anchors_path)
    classes_path = os.path.expanduser(cfg.classes_path)
    test_path = os.path.expanduser(cfg.test_path)
    output_path = os.path.expanduser(cfg.segmented_output_path)
    json_path = os.path.expanduser(cfg.json_output)

    if not os.path.exists(output_path):
        print('Creating output path {}'.format(output_path))
        os.mkdir(output_path)

    sess = K.get_session()
    class_names = Preprocessor.__return_class_names__(classes_path)
    anchors = Preprocessor.__return_anchors__(anchors_path)
    yolo_model = load_model(model_path)

    # Verify model, anchors, and classes are compatible
    num_classes = len(class_names)
    num_anchors = len(anchors)
    info = 'Mismatch between model and given anchor and class sizes. ' \
           'Specify matching anchors and classes with --anchors_path and --classes_path flags.'
    model_output_channels = yolo_model.layers[-1].output_shape[-1]
    assert model_output_channels == num_anchors * (num_classes + 5), info
    print('{} model, anchors, and classes loaded.'.format(model_path))

    # Check if model is fully convolutional, assuming channel last order.
    model_image_size = yolo_model.layers[0].input_shape[1:3]
    is_fixed_size = model_image_size != (None, None)

    # Generate Colors for drawing bounding boxes
    hsv_tuples, colors = Preprocessor.__generate_colors_for_bounding_boxes__(class_names)

    yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names))
    input_image_shape = K.placeholder(shape=(2, ))
    boxes, scores, classes = yolo_eval(yolo_outputs,
                                       input_image_shape,
                                       score_threshold=cfg.score_threshold,
                                       iou_threshold=cfg.iou_threshold)

    # Load Images from the root folder
    input_images_model_1, all_images, data_path, data_path_with_image_name = Preprocessor.__load_image_data_thumbnails__(
        data_path, cfg.compressed_image_height, cfg.compressed_image_width,
        cfg.compressed_channel, cfg.number_of_categories,
        cfg.number_of_images_per_category, root_dir, is_fixed_size,
        model_image_size, sess, yolo_model, input_image_shape, boxes, scores,
        classes, cfg.font_path, class_names, colors, output_path, json_path,
        test_path,
        True,   # Segmentation Flag
        False,  # Edge-detection Flag
        True,   # Extract object Flag
        False)  # Gray Scale Flag

    input_images_model_2, all_images, data_path, data_path_with_image_name = Preprocessor.__load_image_data_thumbnails__(
        data_path, cfg.compressed_image_height, cfg.compressed_image_width,
        cfg.compressed_channel, cfg.number_of_categories,
        cfg.number_of_images_per_category, root_dir, is_fixed_size,
        model_image_size, sess, yolo_model, input_image_shape, boxes, scores,
        classes, cfg.font_path, class_names, colors, output_path, json_path,
        test_path, False, True, False, False)

    input_images_model_3, all_images, data_path, data_path_with_image_name = Preprocessor.__load_image_data_thumbnails__(
        data_path, cfg.image_height, cfg.image_width, cfg.channel,
        cfg.number_of_categories, cfg.number_of_images_per_category, root_dir,
        is_fixed_size, model_image_size, sess, yolo_model, input_image_shape,
        boxes, scores, classes, cfg.font_path, class_names, colors,
        output_path, json_path, test_path, False, False, False, False)

    input_shape = [
        cfg.compressed_image_height, cfg.compressed_image_width,
        cfg.compressed_channel
    ]
    input_shape_3 = [cfg.image_height, cfg.image_width, cfg.channel]

    # load (pre-trained) weights for model_1
    print('-' * 30)
    print('Loading model weights...\n')
    weight_folder = cfg.model_1_save  # the path where the model weights are stored
    weight_file = 'model_1.h5'
    model_1 = Preprocessor.__load_model_weights__(weight_folder, weight_file,
                                                  input_shape, input_shape_3,
                                                  "Model_1")

    # load (pre-trained) weights for model_2
    print('-' * 30)
    print('Loading model weights...\n')
    weight_folder = cfg.model_2_save  # the path where the model weights are stored
    weight_file = 'model_2.h5'
    model_2 = Preprocessor.__load_model_weights__(weight_folder, weight_file,
                                                  input_shape, input_shape_3,
                                                  "Model_2")

    # load (pre-trained) weights for model_3
    print('-' * 30)
    print('Loading model weights...\n')
    weight_folder = cfg.model_3_save  # the path where the model weights are stored
    weight_file = 'model_3.h5'
    model_3 = Preprocessor.__load_model_weights__(weight_folder, weight_file,
                                                  input_shape, input_shape_3,
                                                  "Model_3")

    print(root_dir)
    print(os.path.join(root_dir, cfg.output_model_1))
    output_path_model_1 = os.path.join(root_dir + cfg.output_model_1)
    output_path_model_2 = os.path.join(root_dir + cfg.output_model_2)
    output_path_model_3 = os.path.join(root_dir + cfg.output_model_3)

    Preprocessor.__create_output_directories__(output_path_model_1)
    Preprocessor.__create_output_directories__(output_path_model_2)
    Preprocessor.__create_output_directories__(output_path_model_3)
    features_from_model_1 = Preprocessor.__get_score_model__(
        model_1, input_images_model_1, output_path_model_1)
    features_from_model_2 = Preprocessor.__get_score_model__(
        model_2, input_images_model_2, output_path_model_2)
    features_from_model_3 = Preprocessor.__get_score_model__(
        model_3, input_images_model_3, output_path_model_3)

    features_from_model_1 = Preprocessor.__flatten_img_data__(features_from_model_1)
    features_from_model_2 = Preprocessor.__flatten_img_data__(features_from_model_2)
    features_from_model_3 = Preprocessor.__flatten_img_data__(features_from_model_3)

    fused_features = np.concatenate([
        features_from_model_1, features_from_model_2, features_from_model_3
    ], axis=1)
    fused_features = [
        Preprocessor.__binarize__(features) for features in fused_features
    ]

    counter_for_predictions = 0
    sub_average_precision_make, sub_average_precision_color = [], []
    sub_average_precision_body, sub_average_precision_model = [], []
    cum_average_precision_make, cum_average_precision_color = [], []
    cum_average_precision_body, cum_average_precision_model = [], []
    precision_at_3_5_10_all = ''.join(cfg.precision_counter).split(',')

    while counter_for_predictions <= 2:
        test_image_idx = int(len(input_images_model_1) * random())
        if test_image_idx < len(data_path_with_image_name):
            idx_closest = Preprocessor.__get_closest_images__(
                test_image_idx, fused_features, cfg.number_of_predictions)
            test_image = Preprocessor.__get_concatenated_images__(
                data_path_with_image_name, [test_image_idx],
                cfg.compressed_image_width)
            results_image = Preprocessor.__get_concatenated_images__(
                data_path_with_image_name, idx_closest,
                cfg.compressed_image_width)
            source_category = str(
                data_path_with_image_name[test_image_idx]).split('/')

            similar_image = []
            similar_idx_closest = []
            for counter_for_recommendations in range(0, len(idx_closest)):
                category = str(data_path_with_image_name[
                    idx_closest[counter_for_recommendations]]).split('/')
                if str(source_category[-2]).strip() == str(category[-2].strip()):
                    similar_image.append(data_path_with_image_name[
                        idx_closest[counter_for_recommendations]])
                    similar_idx_closest.append(
                        idx_closest[counter_for_recommendations])

            print("Test Image ID:", test_image_idx)
            print("\n")
            print("Closest Images ID:", idx_closest)
            print("\n")
            print("Similar Images ID", similar_idx_closest)
            print("\n")

            precision_per_make, precision_per_color = [], []
            precision_per_body_wise, precision_per_model_wise = [], []
            results_image_recommendations = []
            for i in range(0, len(precision_at_3_5_10_all)):
                results_image_recommendations = Preprocessor.__get_concatenated_images__(
                    data_path_with_image_name, similar_idx_closest,
                    cfg.compressed_image_width)
                list_of_similar_image_names = Preprocessor.__return_image_names__(
                    data_path_with_image_name, similar_idx_closest)
                name_of_test_image = Preprocessor.__return_image_names__(
                    data_path_with_image_name, [test_image_idx])
                dict_of_attributes_of_similar_images = Preprocessor.__get_attributes_list__(
                    list_of_similar_image_names,
                    os.path.join(root_dir, cfg.input_filename))
                dict_of_attributes_of_test_image = Preprocessor.__get_attributes_list__(
                    name_of_test_image,
                    os.path.join(root_dir, cfg.input_filename))
                similar_make_wise = Preprocessor.__get_similar__(
                    dict_of_attributes_of_test_image,
                    dict_of_attributes_of_similar_images[:int(
                        precision_at_3_5_10_all[i])], 'make')
                similar_color_wise = Preprocessor.__get_similar__(
                    dict_of_attributes_of_test_image,
                    dict_of_attributes_of_similar_images[:int(
                        precision_at_3_5_10_all[i])], 'color')
                similar_body_wise = Preprocessor.__get_similar__(
                    dict_of_attributes_of_test_image,
                    dict_of_attributes_of_similar_images[:int(
                        precision_at_3_5_10_all[i])], 'body')
                similar_model_wise = Preprocessor.__get_similar__(
                    dict_of_attributes_of_test_image,
                    dict_of_attributes_of_similar_images[:int(
                        precision_at_3_5_10_all[i])], 'model')

                precision_per_make.append(
                    float(
                        float(len(similar_make_wise)) /
                        int(precision_at_3_5_10_all[i])))
                precision_per_color.append(
                    float(
                        float(len(similar_color_wise)) /
                        int(precision_at_3_5_10_all[i])))
                precision_per_body_wise.append(
                    float(
                        float(len(similar_body_wise)) /
                        int(precision_at_3_5_10_all[i])))
                precision_per_model_wise.append(
                    float(
                        float(len(similar_model_wise)) /
                        int(precision_at_3_5_10_all[i])))

            sub_average_precision_make.append(precision_per_make)
            sub_average_precision_color.append(precision_per_color)
            sub_average_precision_body.append(precision_per_body_wise)
            sub_average_precision_model.append(precision_per_model_wise)

            imsave('test.png', test_image)
            imsave('recommendations.png', results_image_recommendations)
            imsave('total_results.png', results_image)

            counter_for_predictions += 1
            time.sleep(1)
        else:
            print("Index is out of bound")

    cum_average_precision_make.append(
        map(Preprocessor.__mean__, zip(*sub_average_precision_make)))
    cum_average_precision_color.append(
        map(Preprocessor.__mean__, zip(*sub_average_precision_color)))
    cum_average_precision_body.append(
        map(Preprocessor.__mean__, zip(*sub_average_precision_body)))
    cum_average_precision_model.append(
        map(Preprocessor.__mean__, zip(*sub_average_precision_model)))

    print("\n \n \n")
    print("-----------------------------------------------------------------------------------")
    print("Average Precision Make-Wise", precision_at_3_5_10_all,
          map(Preprocessor.__mean__, zip(*cum_average_precision_make)))
    print("Average Precision Color-Wise", precision_at_3_5_10_all,
          map(Preprocessor.__mean__, zip(*cum_average_precision_color)))
    print("Average Precision Body-Wise", precision_at_3_5_10_all,
          map(Preprocessor.__mean__, zip(*cum_average_precision_body)))
    print("Average Precision Model-Wise", precision_at_3_5_10_all,
          map(Preprocessor.__mean__, zip(*cum_average_precision_model)))

    writer = csv.writer(open(os.path.join(root_dir, 'results.csv'), 'w'))
    writer.writerow([
        "Make-Wise: Precision at 3", "Make-Wise: Precision at 5",
        "Make-Wise: Precision at 10"
    ])
    for row in zip(*cum_average_precision_make):
        writer.writerow(row)
    writer.writerow('\n')
    writer.writerow([
        "Color-Wise: Precision at 3", "Color-Wise: Precision at 5",
        "Color-Wise: Precision at 10"
    ])
    for row in zip(*cum_average_precision_color):
        writer.writerow(row)
    writer.writerow('\n')
    writer.writerow([
        "Body-Wise: Precision at 3", "Body-Wise: Precision at 5",
        "Body-Wise: Precision at 10"
    ])
    for row in zip(*cum_average_precision_body):
        writer.writerow(row)
    writer.writerow('\n')
    writer.writerow([
        "Model-Wise: Precision at 3", "Model-Wise: Precision at 5",
        "Model-Wise: Precision at 10"
    ])
    for row in zip(*cum_average_precision_model):
        writer.writerow(row)
    writer.writerow('\n')
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso

from model.nn import BasicNeuralNetwork
from preprocessing.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA, FastICA

BASE_DIR = 'data/'

#########################
# Preprocess data
#########################
train = pd.read_csv(BASE_DIR + 'train.csv')
test = pd.read_csv(BASE_DIR + 'test.csv')

preprocessor = Preprocessor(magicFeature=True)
train_p, test_p = preprocessor.transform(train, test)

#########################
# Create models
#########################
gb = GradientBoostingRegressor(n_estimators=1000,
                               max_features=0.95,
                               learning_rate=0.005,
                               max_depth=4)
las = Lasso(alpha=5)
lgb = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
def main(argv):
    """Main entrypoint for the anonymization tool"""
    # Default parameters
    configuration_file = ''
    input_file = ''
    output_file = ''
    use_cache = False

    # Read and set tool parameters
    try:
        opts, _ = getopt.getopt(
            argv, "c:i:o:vs",
            ["config=", "input=", "output=", "verbose", "use_chached_docs"])
    except getopt.GetoptError:
        logger.error('main.py -c <config_file> -i <input_file> -o <output_file>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-c", "--config"):
            configuration_file = arg
        if opt in ("-i", "--input"):
            input_file = arg
        if opt in ("-o", "--output"):
            output_file = arg
        if opt in ("-s", "--use_chached_docs"):
            use_cache = True
        if opt in ("-v", "--verbose"):
            logging.getLogger().setLevel(logging.DEBUG)

    # Let's get started
    logger.info("Anonymizing input file %s", input_file)

    # Initialize and read configuration
    configuration_reader = ConfigurationReader()
    config = configuration_reader.read(configuration_file)

    # Read data using data types defined in the configuration
    data_reader = DataReader(config)
    df = data_reader.read(input_file)

    # Initialize the sensitive terms recognizer
    sensitive_terms_recognizer = SensitiveTermsRecognizer(config, use_cache)

    # Initialize the preprocessor (preprocessor is stateful, so pass df at the beginning)
    pp = Preprocessor(sensitive_terms_recognizer, config, df)

    # Run through preprocessing of the dataframe: data cleansing, analysis of textual
    # attributes, resolving of redundant information, and compression
    pp.clean_textual_attributes()
    pp.analyze_textual_attributes()
    pp.find_redundant_information()
    pp.compress()

    # Get sensitive terms dictionary and preprocessed dataframe
    terms = pp.get_sensitive_terms()
    df = pp.get_df()

    # Initialize the anonymization kernel by providing the sensitive terms dictionary,
    # the configuration, the sensitive terms recognizer, and the preprocessor
    kernel = AnonymizationKernel(terms, config, sensitive_terms_recognizer, pp)

    # Save the unanonymized dataframe for later
    unanonymized_df = df.copy()

    # Parameters for anonymization
    k = config.parameters["k"]
    strategy = config.parameters["strategy"]
    biases = config.get_biases()
    relational_weight = config.get_relational_weight()

    # Anonymize quasi identifiers (applying k-anonymity) and recode textual attributes
    anonymized_df, partitions, partition_split_statistics = kernel.anonymize_quasi_identifiers(
        df, k, strategy, biases, relational_weight)
    anonymized_df = kernel.recode_textual_attributes(anonymized_df)

    # Parameters for calculating metrics
    quasi_identifiers = config.get_quasi_identifiers()
    textual_attribute_mapping = pp.get_textual_attribute_mapping()

    # Calculate the total, relational, and textual information loss based on the
    # original and anonymized data frame
    total_information_loss, relational_information_loss, textual_information_loss = calculate_normalized_certainty_penalty(
        unanonymized_df, anonymized_df, quasi_identifiers, textual_attribute_mapping)

    # Calculate the mean and std for partition size as well as split statistics
    mean_partition_size = calculate_mean_partition_size(partitions)
    std_partition_size = calculate_std_partition_size(partitions)
    if partition_split_statistics:
        number_of_relational_splits, number_of_textual_splits = get_partition_split_share(
            partition_split_statistics, textual_attribute_mapping)

    # Notify about the results
    logger.info("Information loss for relational attributes is %4.4f",
                relational_information_loss)
    if textual_information_loss:
        logger.info("Information loss for textual attribute is %4.4f",
                    textual_information_loss["total"])
    logger.info("Total information loss is %4.4f", total_information_loss)
    logger.info("Ended up with %d partitions with a mean size of %.2f and a std of %.2f",
                len(partitions), mean_partition_size, std_partition_size)
    if partition_split_statistics:
        logger.info("Split %d times on a relational attribute", number_of_relational_splits)
        logger.info("Split %d times on a textual attribute", number_of_textual_splits)

    # Initialize the postprocessor with the config and the preprocessor
    post_processor = PostProcessor(config, pp)

    # Perform post processing actions on the anonymized data frame
    anonymized_df = post_processor.clean(anonymized_df)
    anonymized_df = post_processor.uncompress(anonymized_df)
    anonymized_df = post_processor.pretty(anonymized_df)

    # Don't forget to drop the direct identifiers since they are not needed anymore
    anonymized_df = kernel.remove_direct_identifier(anonymized_df)

    # Notify and save
    logger.info("Saving anonymized file to %s", output_file)
    anonymized_df.to_csv(output_file, index=False)
from metrics import evaluation
from utilities import cluster_cf

if __name__ == "__main__":
    start_time = time.time()
    np.set_printoptions(threshold=np.nan)
    vectorizer = DictVectorizer()

    # reset file reader
    chunks = file_io.read_lastfm_user_art_file("data/halfid_20%_train.tsv")

    valid_songs = []  # don't filter with valid songs
    valid_songs = file_io.get_all_valid_songs(
        'data/song_word2vec_whole_truncate_60000_new.csv')

    pre = Preprocessor(chunks, vectorizer, valid_songs)
    pre.reset_file_reader(chunks)

    # read user song mapping
    pre.read_user_songs(3000000)

    # convert to user-song matrix
    X = pre.get_user_song_matrix()
    print("non zeros: {0}".format(X.count_nonzero()))
    print("pre-processed in {0:.2f} sec".format(time.time() - start_time))

    #cluster_cf.cluster_usr(X, k=5)
    print("non zeros: {0}".format(X.count_nonzero()))

    pred = recommendation.predict_by_user(X)
    #pred = recommendation.predict_by_factorize(X)
    recommended = recommendation.recommend_all(X, pred, masked=False)
print('export_path = {}\n'.format(export_path))
if os.path.isdir(export_path):
    print('\nAlready saved a model, cleaning up\n')

from keras import backend
sess = backend.get_session()
tf.compat.v1.saved_model.simple_save(
    sess,
    export_path,
    inputs={'input_image': model.input},
    outputs={t.name: t for t in model.outputs})

# working with dataload_ecg
df_train, df_test = Preprocessor.load_mitbih("")
df_train_ecg, df_test_ecg = Preprocessor.load_ecg("")
df_train.append(df_train_ecg)
df_test.append(df_test_ecg)

X_train, y_train, X_val, y_val, X_test, y_test = Preprocessor.CNN_preprocessor(
    df_train, df_test, 800, 2000)

# define model
n_obs, feature, depth = X_train.shape
model = CNNModel.get_model(n_obs, feature, depth)
h_params = CNNModel.CNN_hyperparameters(n_obs)
model.compile(loss=h_params['loss'],
              optimizer=h_params['optimizer'],
              metrics=h_params['metrics'])

history = model.fit(X_train, y_train,
OUTPUT_DIR = "use_out" # dummy to make the network happy BATCH_SIZE = None ''' Network ''' # file location of weights to restore from (i.e. weights/model1.ckpt) INITIAL_WEIGHTS = 'checkpoints/cvd_model.ckpt' ''' SCRIPT ''' # Only run if this is the main module to be run if __name__ == '__main__': # build preprocessor ppr = Preprocessor() # Process raw data X, Y, events_found = ppr.get_raw_data(DIMENSION, [RAW_FILE], bad) X, Y = ppr.remove_outliers(X, Y) X, Y = ppr.normalize(X, Y) trX, trY, teX, teY, vaX, vaY = ppr.partition_for_training(X, Y, 0.0, 1.0) ppr.store_training_partitions(trX, trY, teX, teY, vaX, vaY, INPUT_DIR) # build adapter adapter = MACAdapter(INPUT_DIR, DIMENSION, FOLDS) # build model convnet = ConvNet(DIMENSION) # build server
if not Path(args.gt_path).exists():
    print(args.gt_path + " does not exist")
    exit(-1)

if args.uncert_path:
    if not Path(args.uncert_path).exists():
        print(args.uncert_path + " does not exist")
        exit(-1)
    uncert_path = Path(args.uncert_path)
else:
    uncert_path = None

model_path = Path(args.model_path)
options = json.load(open(Path(model_path, 'options.json'), 'r'))

# Setup Preprocessing filters
preprocessor = Preprocessor()
if 'denoise' in options:
    # pass the denoise parameters only when they were recorded in the options
    if 'denoise_parms' in options:
        preprocessor.add_filter(
            filter.get_denoise_filter(options['denoise'],
                                      options['denoise_parms']))
    else:
        preprocessor.add_filter(
            filter.get_denoise_filter(options['denoise']))

if 'contrast' in options and options['contrast']:
    clahe = cv2.createCLAHE(
        clipLimit=2.0,
        tileGridSize=(25, 25))  # CLAHE adaptive contrast enhancement
    preprocessor.add_filter(clahe.apply)
def compare_preprocessing():
    # Loading train and test data:
    all_categories = [
        'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
        'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
        'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
        'sci.space', 'soc.religion.christian', 'talk.politics.guns',
        'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
    ]

    print("Loading 20 newsgroups...")
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'),
                                          categories=all_categories)
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         remove=('headers', 'footers', 'quotes'),
                                         categories=all_categories)
    print("{} training documents loaded.".format(
        newsgroups_train.filenames.shape[0]))

    print("Building Preprocessor combinations...")
    # flags: special_character_removal, number_removal, url_email_removal,
    # stopword_removal, lower, stemming, lemmatize
    num_of_preprocessor_flags = 7
    # Creates a list of all possible permutations of a boolean list with the
    # length of the number of flags
    booleans = [False, True]
    flags_list = [
        list(b)
        for b in itertools.product(booleans, repeat=num_of_preprocessor_flags)
    ]
    invalid_flags = []
    for i in range(len(flags_list)):
        if flags_list[i][5] and flags_list[i][6]:
            # Removes simultaneous Stemming and Lemmatization
            invalid_flags.append(flags_list[i])
        elif flags_list[i][5] and not flags_list[i][4]:
            # Removes Stemming without lowercase (lowercase is inbuilt)
            invalid_flags.append(flags_list[i])
    flags_list = [x for x in flags_list if x not in invalid_flags]
    print("{} Combinations built.".format(len(flags_list)))

    # Initialize vectorizer, machine learning algorithm and data frame to store the results
    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=dummy,
                                 lowercase=False,
                                 preprocessor=dummy,
                                 stop_words=None)
    clf = MultinomialNB(alpha=.01)
    columns = [
        'Special Character Removal', 'Number Removal', 'URL and E-Mail Removal',
        'Stopword Removal', 'Lowercase', 'Stemming', 'Lemmatization',
        'Unique Words', 'Accuracy'
    ]
    rows = []

    for flags in flags_list:  # loops through all combinations
        prep = Preprocessor(special_character_removal=flags[0],
                            number_removal=flags[1],
                            url_email_removal=flags[2],
                            stopword_removal=flags[3],
                            lower=flags[4],
                            stemming=flags[5],
                            lemmatize=flags[6])
        preprocessed_train_data = [
            prep.preprocess(d) for d in newsgroups_train.data
        ]
        preprocessed_test_data = [
            prep.preprocess(d) for d in newsgroups_test.data
        ]

        vectors = vectorizer.fit_transform(preprocessed_train_data)

        # Train machine learning model
        clf.fit(vectors, newsgroups_train.target)

        # Transform test data to the model fitted to the training data
        vectors_test = vectorizer.transform(preprocessed_test_data)

        # Evaluate
        pred = clf.predict(vectors_test)
        vocab = vectors.shape[1]
        accuracy = metrics.accuracy_score(newsgroups_test.target, pred)
        rows.append([
            flags[0], flags[1], flags[2], flags[3], flags[4], flags[5],
            flags[6], vocab, accuracy
        ])
        print(
            "Spec: {} , Numbers: {} , EmailUrl: {} , SWR: {}, low: {}, Stem: {} , Lem: {} -> Vocab: {}, Acc: {}"
            .format(flags[0], flags[1], flags[2], flags[3], flags[4], flags[5],
                    flags[6], vocab, accuracy))

    # Organize data frame and save the results
    df = pd.DataFrame(np.array(rows), columns=columns)
    df = df.sort_values(by=['Accuracy'], ascending=False)
    pprint(df)
    df.to_csv('results.csv', sep=';')
if __name__ == "__main__": csrc_path = argv[0] with open(csrc_path) as fl: avr_code = gcc.compile(fl.read()) with open("lib/avrheader.sap") as fl: avrheader = fl.read() translator = Translator(avrheader) sap = translator.to_sap(avr_code) with open("build/build.sap.superset", "w") as fl: fl.write(sap) proc = Preprocessor() proc.load_extension("ext/sapplus.json") sap = proc.preprocess(sap) with open("build/build.sap", "w") as fl: fl.write(sap) out = assemble(sap) if out.success: pass else: print("SAP output did not compile successfully")
# The location in which to save the model
SAVE_NAME = "example_training.ckpt"
'''
SCRIPT
'''
# Only run if this is the main module to be run
if __name__ == '__main__':
    # JSON object returned from api_call
    # replace this with however you would like it to work in production
    json_data = json.load(open(EXAMPLE_FILE))
    # NOTE if events in the json object have neither "aircraft" nor "community"
    # fields in them, they will be labeled as community for training -
    # probably try to avoid this

    # build preprocessor
    ppr = Preprocessor()

    # Process raw data
    #X, Y, events_found = ppr.get_raw_data(DIMENSION, [RAW_FILE], bad)
    X, Y, events_found = ppr.get_from_json(DIMENSION, json_data)
    X, Y = ppr.remove_outliers(X, Y)
    X, Y = ppr.normalize(X, Y)
    # Shove all events into the "training" subdirectory
    trX, trY, teX, teY, vaX, vaY = ppr.partition_for_training(X, Y, 1.0, 0.0)
    # Store events in intermediate directory (will be deleted on subsequent trainings)
    ppr.store_training_partitions(trX, trY, teX, teY, vaX, vaY, INPUT_DIR)

    # build adapter
    adapter = MACAdapter(INPUT_DIR, DIMENSION, FOLDS)
    # build model
""" Demo file for usage of the Preprocessor class Initialize the object with the required parameters, then call the process_images() method """ import sys, os lib_path = os.path.abspath(os.path.join(__file__, '../..')) sys.path.append(lib_path) from preprocessing.preprocessor import Preprocessor # inputs size = 299 data_dir = './data/' annotations_dir = './annotations/' dest_dir = './data_preprocessed_{}/'.format(size) train_annotations = '{}train2017.json'.format(annotations_dir) val_annotations = '{}val2017.json'.format(annotations_dir) preprocessor_train = Preprocessor(data_dir, train_annotations, (size, size)) preprocessor_train.process_images(dest_dir) preprocessor_val = Preprocessor(data_dir, val_annotations, (size, size)) preprocessor_val.process_images(dest_dir) print("Preprocessing in", dest_dir, "completed.")
def __run_training__():
    cfg = Configuration()
    # These variables would be parametrized
    GPU = True
    if not GPU:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # Input Path
    root_dir = os.path.dirname(os.path.abspath(__file__))
    image_path = cfg.image_path
    json_path = os.path.join(root_dir, cfg.input_filename)
    trainingset = os.path.join(root_dir, 'trainingset')
    Preprocessor.__generate_kijiji_set__(root_dir, image_path, json_path,
                                         trainingset, 'make')

    # --------------------------------------------------------------------------------------------------------------
    image_path = os.path.join(root_dir, 'trainingset')
    data_path = glob(image_path + "/*")

    # Image Segmentation Parameters
    model_path = os.path.expanduser(cfg.model_path)
    assert model_path.endswith('.h5'), 'Keras model must be a .h5 file.'
    anchors_path = os.path.expanduser(cfg.anchors_path)
    classes_path = os.path.expanduser(cfg.classes_path)
    test_path = os.path.expanduser(cfg.test_path)
    output_path = os.path.expanduser(cfg.segmented_output_path)
    json_path = os.path.expanduser(cfg.json_output)

    if not os.path.exists(output_path):
        print('Creating output path {}'.format(output_path))
        os.mkdir(output_path)

    sess = K.get_session()
    class_names = Preprocessor.__return_class_names__(classes_path)
    anchors = Preprocessor.__return_anchors__(anchors_path)
    yolo_model = load_model(model_path)

    # Verify model, anchors, and classes are compatible
    num_classes = len(class_names)
    num_anchors = len(anchors)
    info = 'Mismatch between model and given anchor and class sizes. ' \
           'Specify matching anchors and classes with --anchors_path and --classes_path flags.'
    model_output_channels = yolo_model.layers[-1].output_shape[-1]
    assert model_output_channels == num_anchors * (num_classes + 5), info
    print('{} model, anchors, and classes loaded.'.format(model_path))

    # Check if model is fully convolutional, assuming channel last order.
    model_image_size = yolo_model.layers[0].input_shape[1:3]
    is_fixed_size = model_image_size != (None, None)

    # Generate Colors for drawing bounding boxes
    hsv_tuples, colors = Preprocessor.__generate_colors_for_bounding_boxes__(class_names)

    yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names))
    input_image_shape = K.placeholder(shape=(2, ))
    boxes, scores, classes = yolo_eval(yolo_outputs,
                                       input_image_shape,
                                       score_threshold=cfg.score_threshold,
                                       iou_threshold=cfg.iou_threshold)

    # Load Images from the root folder
    input_images_model_1, all_images, data_path, data_path_with_image_name = Preprocessor.__load_image_data_thumbnails__(
        data_path, cfg.compressed_image_height, cfg.compressed_image_width,
        cfg.compressed_channel, cfg.number_of_categories,
        cfg.number_of_images_per_category, root_dir, is_fixed_size,
        model_image_size, sess, yolo_model, input_image_shape, boxes, scores,
        classes, cfg.font_path, class_names, colors, output_path, json_path,
        test_path,
        True,   # Segmentation Flag
        False,  # Edge-detection Flag
        True,   # Extract object Flag
        False)  # Gray Scale Flag

    input_images_model_2, all_images, data_path, data_path_with_image_name = Preprocessor.__load_image_data_thumbnails__(
        data_path, cfg.compressed_image_height, cfg.compressed_image_width,
        cfg.compressed_channel, cfg.number_of_categories,
        cfg.number_of_images_per_category, root_dir, is_fixed_size,
        model_image_size, sess, yolo_model, input_image_shape, boxes, scores,
        classes, cfg.font_path, class_names, colors, output_path, json_path,
        test_path,
        False,  # Segmentation Flag
        True,   # Edge-detection Flag
        False,  # Extract object Flag
        False)  # Gray Scale Flag

    input_images_model_3, all_images, data_path, data_path_with_image_name = Preprocessor.__load_image_data_thumbnails__(
        data_path, cfg.image_height, cfg.image_width, cfg.channel,
        cfg.number_of_categories, cfg.number_of_images_per_category, root_dir,
        is_fixed_size, model_image_size, sess, yolo_model, input_image_shape,
        boxes, scores, classes, cfg.font_path, class_names, colors,
        output_path, json_path, test_path,
        False,  # Segmentation Flag
        False,  # Edge-detection Flag
        False,  # Extract object Flag
        False)  # Gray Scale Flag

    input_shape = [
        cfg.compressed_image_height, cfg.compressed_image_width,
        cfg.compressed_channel
    ]
    input_shape_3 = [cfg.image_height, cfg.image_width, cfg.channel]

    # Model Save Paths
    model_1_save_path = os.path.join(root_dir + cfg.model_1_save)
    model_2_save_path = os.path.join(root_dir + cfg.model_2_save)
    model_3_save_path = os.path.join(root_dir + cfg.model_3_save)
    Preprocessor.__create_output_directories__(model_1_save_path)
    Preprocessor.__create_output_directories__(model_2_save_path)
    Preprocessor.__create_output_directories__(model_3_save_path)

    # Instantiating the training class
    train = Train(input_images_model_1, input_images_model_2,
                  input_images_model_3, input_shape, input_shape_3,
                  cfg.batch_size, cfg.epochs, model_1_save_path,
                  model_2_save_path, model_3_save_path)

    # Output Path
    output_path_model_1 = os.path.join(root_dir + cfg.output_model_1)
    output_path_model_2 = os.path.join(root_dir + cfg.output_model_2)
    output_path_model_3 = os.path.join(root_dir + cfg.output_model_3)
    Preprocessor.__create_output_directories__(output_path_model_1)
    Preprocessor.__create_output_directories__(output_path_model_2)
    Preprocessor.__create_output_directories__(output_path_model_3)

    # FCN Model
    model_1 = train.__train_model_1__()
    # VGG Model
    model_2 = train.__train_model_2__()
    # Inception-v3
    model_3 = train.__train_model_3__()

    features_from_model_1 = Preprocessor.__get_score_model__(
        model_1, input_images_model_1, output_path_model_1)
    features_from_model_2 = Preprocessor.__get_score_model__(
        model_2, input_images_model_2, output_path_model_2)
    features_from_model_3 = Preprocessor.__get_score_model__(
        model_3, input_images_model_3, output_path_model_3)

    print("Output FeatureMap For Model 1 \n")
    print(features_from_model_1.shape)
    print("\n")
    print("Output FeatureMap For Model 2 \n")
    print(features_from_model_2.shape)
    print("\n")
    print("Output FeatureMap For Model 3 \n")
    print(features_from_model_3.shape)
    print("\n")