def processing():
    """Scan every configured dataset for keyword matches and write one
    match file per dataset.

    Reads the single-keyword list, checks each English tweet's cleaned text
    against it, and writes (tweet_id, matches) CSVs under
    individual_keyword_matches/<dataset>/.

    Returns:
        list[str]: paths of the match files written.
    """
    keyword_list = helpers.load_dataset(ds.output_data + "keywords/keywords_single_list.csv")
    store = {}  # per-dataset mapping: tweetid -> matched keywords
    keyword_list = list_creator(keyword_list)
    file_paths = []
    for df in ds.all_datasets:
        print(" - Processing", df)
        f_name = df  # keep the "dataset/filename" key before df is rebound
        store[f_name] = {}
        df = helpers.load_dataset(ds.dataset + df)
        df = df[df.tweet_language == "en"]  # English tweets only
        for index, row in df.iterrows():
            matches = check_keyword(clean_tweet(row.tweet_text), keyword_list)
            if len(matches) != 0:
                store[f_name][row.tweetid] = matches
    # # storage
    matches_counter = 0
    for f_name in store:
        data_list = []
        filename = f_name.split("/")
        dataset = filename[0]
        filename = filename[1]
        path = ds.output_data + "individual_keyword_matches/"
        dataset_path = path + dataset + "/"
        helpers.path_checker(dataset_path)  # ensure the output dir exists
        file_path = dataset_path + filename
        for item in store[f_name]:
            data_list.append([item, store[f_name][item]])
            matches_counter += 1
        helpers.data_to_file_two_values(data_list, '"tweet_id","matches"', file_path)
        file_paths.append(file_path)
    return file_paths
def get_undetected_red(dataset='test'):
    """Find red lights the classifier labelled as green.

    Loads the requested split ('test' or 'training'), standardizes it, and
    collects every image whose predicted label is green ([0, 0, 1]) while the
    true label is red ([1, 0, 0]). Prints a summary and the offending image
    indices, then returns the collected tuples.
    """
    if dataset == 'test':
        raw = helpers.load_dataset(IMAGE_DIR_TEST)
    elif dataset == 'training':
        raw = helpers.load_dataset(IMAGE_DIR_TRAINING)
    standardized = standardize(raw)
    undetected = []
    for idx, record in enumerate(standardized):
        img = record[0]
        truth = record[1]
        guess = estimate_label(img)
        # Only red-misread-as-green is of interest here.
        if guess != truth and guess == [0, 0, 1] and truth == [1, 0, 0]:
            undetected.append((img, guess, truth, idx))
    print("Number of undetected red in ", dataset, " = ", len(undetected), ' out of ', len(standardized))
    for entry in undetected:
        print("\tImage num ", entry[3], " in dataset ", dataset, " false detected ", entry[1])
    return undetected
def load_data():
    """Load the training, test and validation object lists.

    Each split is read from its data/ subdirectory; any split whose images
    cannot be loaded falls back to an empty list (original behaviour).

    Returns:
        tuple: (train_objects, test_objects, val_objects)
    """
    def _load_split(image_dir):
        # Mirror the original per-split try/except: swallow any loading
        # error and return an empty list for that split.
        try:
            image_list = helpers.load_dataset(image_dir)
            return helpers.load_objects(image_list, image_dir)
        except Exception:
            return []

    train_objects = _load_split("data/training")
    test_objects = _load_split("data/test")
    val_objects = _load_split("data/val")
    return train_objects, test_objects, val_objects
def negation_handled(folder, n_grams):
    """Re-run the first experiments on the negation-handled dataset.

    For every per-MPT results file in *folder* (skipping "best_*" summaries),
    recompute the first experimental results and write them back in place.
    """
    negated_data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    for name in get_results_filenames(folder):
        prefix = name.split("_")[0]
        if prefix == "best":
            continue  # aggregated summary file, not a per-MPT result
        updated = get_first_experimental_results(
            negated_data, int(prefix), helpers.load_dataset(folder + name), n_grams)
        helpers.dataframe_to_csv(updated, folder + name)
def execute(folder, n_grams):
    """Run the first experiments for every per-emotion results file in *folder*.

    Skips "best_*" summary files; each remaining file is updated in place.
    """
    data = helpers.load_dataset(ds.dataset + ds.file)
    for name in get_results_filenames(folder):
        emotion = name.split("_")[0]
        if emotion == "best":
            continue  # aggregated summary file, not a per-emotion result
        updated = get_first_experimental_results(
            data, emotion, helpers.load_dataset(folder + name), n_grams)
        helpers.dataframe_to_csv(updated, folder + name)
def execute(folder, n_grams):
    """Run the queued next-experiments for every per-emotion results file.

    The experiment queue is read from the emotion_detection project dir
    (hard-coded absolute path, as in the rest of the project); each results
    file is updated in place with the outcomes.
    """
    data = helpers.load_dataset(ds.dataset + ds.file)
    queued = helpers.load_dataset("/home/michael/MRes/actual_project/emotion_detection/" + n_grams + "/next_experiments.csv")
    for name in get_results_filenames(folder):
        emotion = name.split("_")[0]
        if emotion == "best":
            continue  # aggregated summary file
        subset = queued[queued.emotion == emotion]
        updated = process_experiments(
            data, emotion, subset, helpers.load_dataset(folder + name), n_grams)
        helpers.dataframe_to_csv(updated, folder + name)
def tst_estimate_label(im_num, dataset='test'):
    """Debug helper: print the label predicted for one standardized image.

    Loads the chosen split ('test' or 'training'), standardizes it, and runs
    estimate_label on image *im_num*.
    """
    if dataset == 'test':
        raw = helpers.load_dataset(IMAGE_DIR_TEST)
    elif dataset == 'training':
        raw = helpers.load_dataset(IMAGE_DIR_TRAINING)
    record = standardize(raw)[im_num]
    prediction = estimate_label(record[0])
    print(prediction)
def process_negation_handled_experiments(folder, n_grams):
    """Run the queued negation-handled experiments per MPT results file.

    Reads the queue from the sentiment_analysis project dir (hard-coded
    absolute path, consistent with the rest of the project), filters it to
    each file's MPT value, and writes the updated results back in place.
    """
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    queued = helpers.load_dataset("/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams + "/next_negation_handled_experiments.csv")
    for name in get_results_filenames(folder):
        prefix = name.split("_")[0]
        if prefix == "best":
            continue  # aggregated summary file
        mpt = int(prefix)
        subset = queued[queued.mpt == mpt]
        updated = process_experiments(
            data, mpt, subset, helpers.load_dataset(folder + name), n_grams)
        helpers.dataframe_to_csv(updated, folder + name)
def get_existing_results(folder, dataset_type, n_grams):
    """Queue follow-up experiments based on the best existing results.

    For every per-MPT results file in *folder*, take the top-3 rows per
    algorithm by weighted F1, queue next experiments from them, and write the
    queue to next_<dataset_type>_experiments.csv under the sentiment_analysis
    project dir.
    """
    experiments = []
    for file in get_results_filenames(folder):
        mpt = file.split("_")[0]
        if mpt == "best":
            continue  # skip the aggregated "best_*" summary file
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        # Top 3 results per algorithm, best weighted F1 first.
        results_df = results_df.sort_values(
            ['weighted_avg_f1-score'], ascending=False).groupby('algorithm').head(3)
        results_df = results_df.reset_index(drop=True)
        algorithms = algorithm_single_list(results_df.algorithm.tolist())
        for algorithm in algorithms:
            relevant_rows = results_df[results_df.algorithm == algorithm]
            for index, row in relevant_rows.iterrows():
                experiments = next_experiments(mpt, algorithm,
                                               row.hyperparameter, experiments)
                # NOTE(review): only the first (best) row per algorithm is
                # consumed despite head(3) above — confirm this is intended.
                break
    new_experiments_df = pd.DataFrame(
        experiments, columns=["mpt", "algorithm", "hyperparameter"])
    helpers.dataframe_to_csv(
        new_experiments_df,
        "/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams +
        "/next_" + dataset_type + "_experiments.csv")
def load_tagged_keywords():
    """Load the tagged keywords and attach a tokenised 'split' column.

    Each row's term is split into a list via keyword_splitter_to_list and
    stored in the new column. Returns the augmented dataframe.
    """
    tagged = helpers.load_dataset(ds.output_data + "keywords/keywords_tagged.csv")
    tagged["split"] = ""
    for idx, row in tagged.iterrows():
        tagged.at[idx, "split"] = keyword_splitter_to_list(row.term)
    return tagged
def __init__(self, experiment_name, vizualize, num_epochs, n_observations):
    """Set up an experiment: visualisation, data, models and optimizers.

    Args:
        experiment_name: name used for the records directory and dashboard.
        vizualize: when truthy, create a dashboard via create_viz.
        num_epochs: number of training epochs to run.
        n_observations: stored on the instance (used elsewhere in the class).

    Fixes: the original assigned self.fixed_noise twice (the second call
    silently overwrote the first with fresh random values), and its comment
    claimed Adam although the optimizers are RMSprop.
    """
    self.experiment_name = experiment_name
    self.n_observations = n_observations
    # Dashboard only when requested.
    self.viz = create_viz('{}_{}'.format(
        name_env, self.experiment_name)) if vizualize else None
    self.dataset, self.dataloader, self.device = load_dataset(
        self.viz, folder_name=self.experiment_name)
    self.netG = Generator(ngpu).to(self.device)
    self.netD = Discriminator(ngpu).to(self.device)
    self.start_epoch = self.filehandling_experiment()
    self.num_epochs = num_epochs
    # Fixed latent noise so progress can be evaluated on the same inputs.
    self.fixed_noise = torch.randn(64, nz, 1, 1, device=self.device)
    # RMSprop optimizers for both G and D.
    self.optimizerD = optim.RMSprop(self.netD.parameters(), lr=lr)
    self.optimizerG = optim.RMSprop(self.netG.parameters(), lr=lr)
def __init__(self):
    """Set up a DCGAN trainer: data, models, loss, labels and optimizers.

    Fixes: the original assigned self.fixed_noise twice; the second call
    silently overwrote the first with fresh random values, so it is removed.
    """
    self.dataset, self.dataloader, self.device = load_dataset()
    self.netG = Generator(ngpu).to(self.device)
    self.netD = Discriminator(ngpu).to(self.device)
    # Initialise Weights
    self.netG.apply(weights_init)
    self.netD.apply(weights_init)
    # define loss function
    self.criterion = nn.BCELoss()
    # Fixed latent noise so progress can be evaluated on the same inputs.
    self.fixed_noise = torch.randn(64, nz, 1, 1, device=self.device)
    # Establish convention for real and fake labels during training
    self.real_label = 1
    self.fake_label = 0
    # Setup Adam optimizers for both G and D
    self.optimizerD = optim.Adam(self.netD.parameters(), lr=lr, betas=(beta1, 0.999))
    self.optimizerG = optim.Adam(self.netG.parameters(), lr=lr, betas=(beta1, 0.999))
def import_best_results_and_sort(folder):
    """Sort the per-emotion best results by macro F1 and save a sorted copy."""
    best = helpers.load_dataset(folder + "best_result_per_emotion.csv")
    best = (best.sort_values(['macro_avg_f1-score'], ascending=False)
                .reset_index(drop=True))
    helpers.dataframe_to_csv(best, folder + "best_result_per_emotion_sorted.csv")
def tweet_extractor():
    """Extract the tweets referenced by the generic/specific keyword-match
    files and write them out per dataset.

    For each source dataset: loads its generic and specific match lists,
    filters the raw tweets to English, adds match/source/month/year columns,
    and writes one specific and one generic extraction CSV.

    Returns:
        tuple[list[str], list[str]]: (generic file paths, specific file paths).
    """
    files_created_generic = []
    files_created_specific = []
    for file in ds.all_datasets:
        generic_df = helpers.load_dataset(ds.output_data + "actual_keyword_matches/generic/" + file)
        specific_df = helpers.load_dataset(ds.output_data + "actual_keyword_matches/specific/" + file)
        print(" - loading data", file)
        df = helpers.load_dataset(ds.dataset + file)
        df = df[df.tweet_language == "en"]  # English tweets only
        # Output schema: original columns plus match metadata.
        columns = []
        for h in df.head():
            columns.append(h)
        columns.append("matches")
        columns.append("source_file")
        columns.append("month")
        columns.append("year")
        df["matches"] = ""
        df["source_file"] = ""
        # NOTE(review): astype("datetime64") is deprecated in newer pandas;
        # pd.to_datetime would be the modern equivalent — confirm before changing.
        df["tweet_time"] = df["tweet_time"].astype("datetime64")
        df["month"] = df["tweet_time"].dt.month
        df["year"] = df["tweet_time"].dt.year
        specific_tweets, generic_tweets = pd.DataFrame(
            columns=columns), pd.DataFrame(columns=columns)
        specific_tweets = match_extractor(specific_df, df, specific_tweets, file, "specific")
        generic_tweets = match_extractor(generic_df, df, generic_tweets, file, "generic")
        output_data_path = ds.output_data + "first_dataset_extraction/"
        dataset = file.split("/")[0]
        filename = file.split("/")[1]
        specific_path = output_data_path + "specific/" + dataset + "/"
        helpers.path_checker(specific_path)  # ensure output dir exists
        helpers.dataframe_to_csv(specific_tweets, specific_path + filename)
        files_created_specific.append(specific_path + filename)
        generic_path = output_data_path + "generic/" + dataset + "/"
        helpers.path_checker(generic_path)
        helpers.dataframe_to_csv(generic_tweets, generic_path + filename)
        files_created_generic.append(generic_path + filename)
    return files_created_generic, files_created_specific
def run(x_key, y_key, start_from: int = 0, window: int = 14, epochs: int = 100, optimizer="adam"):
    """Train the multi-step forecasting model and plot its output.

    Normalizes both series from *start_from* onwards, builds the windowed
    multi-step dataset, fits the model with early stopping plus checkpointing,
    then plots training history and a 28-step forecast.

    NOTE(review): the checkpoint/graph file names contain the literal
    "(unknown)" — looks like a placeholder; confirm against the full file.
    """
    x_data, x_mean, x_std = normalize_dataset(load_dataset(x_key)[start_from:])
    y_data, y_mean, y_std = normalize_dataset(load_dataset(y_key)[start_from:])
    x, y1, y2 = create_dataset_multi_step(x_data, y_data, window)
    # Optimizer may be passed as an instance or by name.
    optimizer_name = (optimizer.get_config()["name"]
                      if isinstance(optimizer, keras.optimizers.Optimizer)
                      else optimizer)
    filename = f"{x_key}_{y_key}_{window}_{optimizer_name}"
    checkpoint = keras.callbacks.ModelCheckpoint(
        f"weights_(unknown).hdf5",
        monitor="loss",
        verbose=True,
        save_best_only=True,
    )
    early_stop = keras.callbacks.EarlyStopping(
        patience=20, monitor="loss", verbose=True, restore_best_weights=True)
    model = create_multi_model(x.shape[-1], y2.shape[-1], optimizer, name=filename)
    history = model.fit(x=x, y=[y1, y2], epochs=epochs,
                        batch_size=x.shape[0],
                        callbacks=[early_stop, checkpoint])
    plot_training_history_with_validation(history, filename)
    _, y_pred = predict_multi_values(model, x_data, y_data, 28)
    plot_predicted(y_pred * y_std + y_mean,
                   START_DATE.shift(days=start_from),
                   f"graph_(unknown).png")
def load_data():
    """Build the training, test and validation image lists.

    helpers.load_dataset reads every image under the given directory.

    Returns:
        tuple: (IMAGE_LIST, TEST_IMAGE_LIST, VALIDATION_IMAGE_LIST) — the
        training, test and validation image arrays; the validation array is
        used to check the algorithm's performance.
    """
    train_dir = "data/training/"
    test_dir = "data/test/"
    val_dir = "data/val/"
    return (helpers.load_dataset(train_dir),
            helpers.load_dataset(test_dir),
            helpers.load_dataset(val_dir))
def single_list_generator():
    """Extract every individual keyword term and write it to a one-column CSV.

    Returns:
        str: path of the keywords_single_list.csv file written.
    """
    source_df = helpers.load_dataset("original_keywords.csv")
    terms = word_extractor(source_df.keywords.tolist())
    out_dir = ds.output_data + "keywords/"
    helpers.path_checker(out_dir)  # ensure the output dir exists
    out_file = out_dir + "keywords_single_list.csv"
    helpers.dataframe_to_csv(pd.DataFrame(terms, columns=["keyword"]), out_file)
    return out_file
def run(x_key, y_key, start_from: int = 0, window: int = 14, validation_split=.0):
    """Train the simple forecasting model and plot its output.

    Normalizes both series from *start_from* onwards, builds the windowed
    dataset, fits with early stopping, plots the history and — when the
    input/output dimensionality matches — a 28-step forecast.
    """
    x_data, x_mean, x_std = normalize_dataset(load_dataset(x_key)[start_from:])
    y_data, y_mean, y_std = normalize_dataset(load_dataset(y_key)[start_from:])
    x, y = create_simple_dataset(x_data, y_data, window)
    checkpoint = keras.callbacks.ModelCheckpoint(
        f"{x_key}_{y_key}_{window}_simple.hdf5",
        monitor="loss",
        verbose=True,
        save_best_only=True,
    )
    early_stop = keras.callbacks.EarlyStopping(
        patience=20, monitor="loss", verbose=True, restore_best_weights=True)
    model = create_simple_model(x.shape[-1], y.shape[-1])
    history = model.fit(
        x=x,
        y=y,
        validation_split=validation_split,
        epochs=100,
        batch_size=x.shape[0],
        # NOTE(review): `checkpoint` is created above but never passed here,
        # matching the original — confirm whether it was meant to be included.
        callbacks=[early_stop],
    )
    plot_training_history_with_validation(
        history, with_validation=bool(validation_split))
    if x.shape[-1] == y.shape[-1]:
        y_pred = predict_values(model, x_data, 28)
        plot_predicted(y_pred * y_std + y_mean,
                       START_DATE.shift(days=start_from),
                       f"{x_key}_{y_key}_{window}_simple.png")
def tagged_keywords_generator():
    """Tag each original keyword and write keywords_tagged.csv.

    Fixes: list_of_terms was appended to without ever being initialised,
    which raised a NameError (or depended on leaked global state).

    Returns:
        str: path of the keywords_tagged.csv file written.
    """
    df = helpers.load_dataset(ds.output_data + "keywords/original_keywords.csv")
    list_of_terms = []
    for item in df.iterrows():
        list_of_terms.append(item[1][0])  # first column of each row
    tagged_terms = term_tagger(list_of_terms)
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)  # ensure the output dir exists
    output_file = output_dir + "keywords_tagged.csv"
    helpers.data_to_file_two_values(tagged_terms, '"term","tag"', output_file)
    return output_file
def import_results(folder):
    """Collect the single best result from every per-MPT results file.

    Sorts each file by weighted F1 (descending), takes its top row, and
    writes all collected rows to best_result_per_mpt.csv in *folder*.
    """
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        mpt = file.split("_")[0]
        if mpt == "best":
            continue  # skip previously generated summary files
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['weighted_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        for index, row in results_df.iterrows():
            new_results.append([mpt, row.algorithm, row.hyperparameter,
                                row.weighted_avg_precision,
                                row.weighted_avg_recall,
                                row["weighted_avg_f1-score"], row.accuracy,
                                row.experiment_type, row.metric_dump_id,
                                row.positive_precision, row.positive_recall,
                                row["positive_f1-score"],
                                row.neutral_precision, row.neutral_recall,
                                row["neutral_f1-score"],
                                row.negative_precision, row.negative_recall,
                                row["negative_f1-score"]])
            break  # only the top (best weighted F1) row per file
    columns = ["mpt", "algorithm", "hyperparameter", "weighted_avg_precision",
               "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
               "experiment_type", "metric_dump_id", "positive_precision",
               "positive_recall", "positive_f1-score", "neutral_precision",
               "neutral_recall", "neutral_f1-score", "negative_precision",
               "negative_recall", "negative_f1-score"]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df, folder + "best_result_per_mpt.csv")
def tst_brightly_colored_pixels(img_number, dataset):
    """Debug helper: visualise the bright-pixel masks for one image.

    Splits the standardized image into thirds, masks each third for its
    expected colour (red/yellow/green), shows the image and the three masks,
    and prints the colour detected by the bright-pixels feature.
    """
    if dataset == 'test':
        raw = helpers.load_dataset(IMAGE_DIR_TEST)
    elif dataset == 'training':
        raw = helpers.load_dataset(IMAGE_DIR_TRAINING)
    im = standardize(raw)[img_number][0]
    top, mid, bottom = split_image_horizontally(im)
    red_mask, _, _ = mask_for_bright_pixels(top, 'red', True)
    yellow_mask, _, _ = mask_for_bright_pixels(mid, 'yellow', True)
    green_mask, _, _ = mask_for_bright_pixels(bottom, 'green', True)
    color = create_brightly_colored_pixels_feature(im, [0, 0, 0], True)
    fig, axes = plt.subplots(1, 4, figsize=(20, 10))
    for ax, panel in zip(axes, (im, red_mask, yellow_mask, green_mask)):
        ax.imshow(panel)
    print("Detected by brigth pixels: ", color)
def merge(dataset_type):
    """Concatenate every per-dataset extraction file of *dataset_type*.

    Improvement: the original concatenated pairwise inside the loop
    (quadratic copying); frames are now collected and concatenated once,
    producing the same result.

    Returns:
        str: path of the merged CSV written.
    """
    print(" - Processing " + dataset_type + " files:")
    frames = []
    for file in ds.all_datasets:
        print(" - " + file)
        parts = file.split("/")
        f_name = (ds.output_data + "first_dataset_extraction/" + dataset_type
                  + "/" + parts[0] + "/" + parts[1])
        frames.append(helpers.load_dataset(f_name))
    merge_hold = pd.concat(frames, sort=False)
    output_path = ds.output_data + "merged_dataset_extraction/"
    helpers.path_checker(output_path)  # ensure the output dir exists
    file_name = dataset_type + ".csv"
    helpers.dataframe_to_csv(merge_hold, output_path + file_name)
    return output_path + file_name
def prepare_timestamp():
    """Load the 30s-windowed Muse data and drop every unneeded channel.

    Improvement: the ~37 repeated `del` statements are replaced by a single
    loop over a tuple of the same keys; the deletions performed are identical.

    Returns:
        the dataset with only the remaining attributes per window.
    """
    dataset_timestamp = load_dataset(
        'study_data_windowed/study_data_windowed_muse_30_s.gzip.pkl')
    # Exactly the keys the original deleted one-by-one.
    drop_keys = (
        'Delta_TP9', 'Delta_AF7', 'Delta_AF8', 'Delta_TP10',
        'Theta_TP9', 'Theta_AF7', 'Theta_AF8', 'Theta_TP10',
        'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
        'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10',
        'Gamma_TP9', 'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10',
        'RAW_TP10', 'RAW_AF7', 'RAW_AF8', 'RAW_TP9',
        'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
        'AUX_RIGHT', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
        'HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8',
        'HeadBandOn', 'Battery',
    )
    # NOTE(review): participant ids 22..64 and window ids 0..10 are
    # hard-coded, matching the original — confirm against the dataset.
    for n in range(22, 65):
        for m in range(0, 11):
            for key in drop_keys:
                del dataset_timestamp[n]['data'][m][key]
    return dataset_timestamp
def dataset_processing():
    """Run SentiWordNet scoring over every tweet in the global dataframe.

    Loads the dataset into the module-global `df`, and for each row: tokenises
    and POS-tags the preprocessed text, matches words to synsets, derives
    sentiment class/scores, and records the stemmed text and the percentage of
    words matched. Prints the global score_counter tallies at the end.

    Returns:
        the augmented global dataframe.
    """
    global score_counter
    global df
    df = helpers.load_dataset(ds.dataset)
    # New output columns, filled per row below.
    df['sentiment_class'] = ""
    df['positive_score'] = ""
    df['negative_score'] = ""
    df['objective_score'] = ""
    df['stemmed_preprocessed_text'] = ""
    df['words_matched_percentage'] = ""
    word_dict = {}  # NOTE(review): never used below — confirm dead
    pos_dict = {}   # NOTE(review): never used below — confirm dead
    for index, row in df.iterrows():
        if index % 100 == 0:
            print(" -", str(index), "/", str(len(df)))  # progress every 100 rows
        stemmed_preprocessed_text = []
        synsets = []
        tweet_text = str(row.preprocessed_tweet_text)
        tweet_text = word_tokenize(tweet_text)
        words_with_pos = pos_tag(tweet_text)
        for word, pos in words_with_pos:
            word_synset = synset_matching(word, pos_tag_conversion(pos), tweet_text, row.tweet_text)
            if word_synset != None:
                synsets.append(word_synset)
            # Every token is stemmed, matched or not (original nesting was
            # ambiguous in the collapsed source — confirm against history).
            stemmed_preprocessed_text.append(stemming(word))
        if len(synsets) > 0:
            sent_class, pos_score, neg_score, obj_score = sentiwordnet_processing(
                synsets)
            df.sentiment_class.at[index] = sent_class
            df.positive_score.at[index] = pos_score
            df.negative_score.at[index] = neg_score
            df.objective_score.at[index] = obj_score
        stemmed_preprocessed_text = " ".join(stemmed_preprocessed_text)
        df.stemmed_preprocessed_text.at[index] = stemmed_preprocessed_text
        if len(tweet_text) != 0:
            df.words_matched_percentage.at[index] = round(
                100 * len(synsets) / len(tweet_text), 2)
        else:
            df.words_matched_percentage.at[index] = 0
    for ix in score_counter:
        print(ix, score_counter[ix])
    return df
def date_selection():
    """Filter each merged extraction file to Aug 2013 – Dec 2018.

    Fixes: the per-month 2013 masks were built from the full `df`
    (`df.month == 8`) while indexing the `df_2013` subset, which misaligns
    the boolean indexer (error or wrong rows depending on pandas version).
    The masks now come from `df_2013` itself.

    Returns:
        list[str]: paths of the filtered files written.
    """
    output_files = []
    path = ds.output_data + "merged_dataset_extraction/"
    files = helpers.path_fetcher(path)
    for file in files:
        df = helpers.load_dataset(path + file)
        df_2013 = df[df.year == 2013]
        # Months Aug–Dec of 2013, kept as separate frames to preserve the
        # original month-grouped row order in the output.
        df_2013_8 = df_2013[df_2013.month == 8]
        df_2013_9 = df_2013[df_2013.month == 9]
        df_2013_10 = df_2013[df_2013.month == 10]
        df_2013_11 = df_2013[df_2013.month == 11]
        df_2013_12 = df_2013[df_2013.month == 12]
        df = df[(df.year == 2014) | (df.year == 2015) | (df.year == 2016) |
                (df.year == 2017) | (df.year == 2018)]
        df = pd.concat(
            [df_2013_8, df_2013_9, df_2013_10, df_2013_11, df_2013_12, df])
        storage_path = ds.output_data + "time_filtered_dataset_extraction/"
        helpers.path_checker(storage_path)  # ensure the output dir exists
        helpers.dataframe_to_csv(df, storage_path + file)
        output_files.append(storage_path + file)
    return output_files
def import_results(folder):
    """Collect the single best result from every per-emotion results file.

    Sorts each file by macro F1 (descending), re-saves the sorted file, takes
    its top row, and writes all collected rows to
    best_result_per_emotion.csv in *folder*.
    """
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        emotion = file.split("_")[0]
        if emotion == "best":
            continue  # skip previously generated summary files
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['macro_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        helpers.dataframe_to_csv(results_df, folder + file)  # persist the sort
        for index, row in results_df.iterrows():
            new_results.append([
                emotion, row.algorithm, row.hyperparameter,
                row.weighted_avg_precision, row.weighted_avg_recall,
                row["weighted_avg_f1-score"], row.accuracy,
                row.experiment_type, row.metric_dump_id,
                row.macro_avg_precision, row.macro_avg_recall,
                row["macro_avg_f1-score"], row[emotion + "_precision"],
                row[emotion + "_recall"], row[emotion + "_f1-score"],
                row["no_" + emotion + "_precision"],
                row["no_" + emotion + "_recall"],
                row["no_" + emotion + "_f1-score"]
            ])
            break  # only the top (best macro F1) row per file
    # NOTE(review): `emotion` here is the loop variable after the loop, so the
    # per-emotion column headers reflect only the LAST emotion processed even
    # though each row holds its own emotion's scores; also raises NameError if
    # no files were processed — confirm whether this is intended.
    columns = [
        "emotion", "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", emotion + "_precision",
        emotion + "_recall", emotion + "_f1-score",
        "no_" + emotion + "_precision", "no_" + emotion + "_recall",
        "no_" + emotion + "_f1-score"
    ]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df, folder + "best_result_per_emotion.csv")
def prepare_data():
    """Load the raw pickled dataset and drop every unneeded channel.

    Improvement: the repeated `del` statements are replaced by a single loop
    over a tuple of the same keys; the deletions performed are identical.

    Returns:
        the dataset with only the remaining attributes per window.
    """
    dataset_raw = load_dataset('data.gzip.pkl')
    # Exactly the keys the original deleted one-by-one.
    drop_keys = (
        'time_stamp',
        'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
        'AUX_RIGHT', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
        'RAW_TP9', 'RAW_TP10', 'RAW_AF7', 'RAW_AF8',
        'HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8',
        'HeadBandOn', 'Battery',
    )
    # NOTE(review): participant ids 22..64 and window ids 0..10 are
    # hard-coded, matching the original — confirm against the dataset.
    for n in range(22, 65):
        for m in range(0, 11):
            for key in drop_keys:
                del dataset_raw[n]['data'][m][key]
    return dataset_raw
import cv2 # computer vision library import helpers import numpy as np import matplotlib.pyplot as plt import matplotlib.image as mpimg # Image data directories image_dir_training = "day_night_images/training/" image_dir_test = "day_night_images/test/" # Using the load_dataset function in helpers.py # Load training data IMAGE_LIST = helpers.load_dataset(image_dir_training) # Standardize all training images STANDARDIZED_LIST = helpers.standardize(IMAGE_LIST) # Display a standardized image and its label # Select an image by index image_num = 0 selected_image = STANDARDIZED_LIST[image_num][0] selected_label = STANDARDIZED_LIST[image_num][1] # Display image and data about it # plt.imshow(selected_image) # print("Shape: "+str(selected_image.shape)) # print("Label [1 = day, 0 = night]: " + str(selected_label)) # Find the average Value or brightness of an image
# ## Load the datasets # # These first few lines of code will load the training traffic light images and store all of them in a variable, `IMAGE_LIST`. This list contains the images and their associated label ("red", "yellow", "green"). # # You are encouraged to take a look at the `load_dataset` function in the helpers.py file. This will give you a good idea about how lots of image files can be read in from a directory using the [glob library](https://pymotw.com/2/glob/). The `load_dataset` function takes in the name of an image directory and returns a list of images and their associated labels. # # For example, the first image-label pair in `IMAGE_LIST` can be accessed by index: # ``` IMAGE_LIST[0][:]```. # # In[3]: # Using the load_dataset function in helpers.py # Load training data IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_TRAINING) # ## Visualize the Data # # The first steps in analyzing any dataset are to 1. load the data and 2. look at the data. Seeing what it looks like will give you an idea of what to look for in the images, what kind of noise or inconsistencies you have to deal with, and so on. This will help you understand the image dataset, and **understanding a dataset is part of making predictions about the data**. # --- # ### Visualize the input images # # Visualize and explore the image data! Write code to display an image in `IMAGE_LIST`: # * Display the image # * Print out the shape of the image # * Print out its corresponding label # # See if you can display at least one of each type of traffic light image – red, green, and yellow — and look at their similarities and differences.
# Load the trained model artefacts produced by the training step.
print('Getting model...')
result = None
with open(os.path.join(model_path, 'train.pickle'), 'rb') as f:
    result = pickle.load(f)
# Pickled tuple layout: (label_probs, probs_per_label, words, labels).
label_probs = result[0]
probs_per_label = result[1]
words = result[2]
labels = result[3]
# Bind the learned parameters so `predictor(tokens)` is a one-arg call.
predictor = partial(helpers.predict, label_probs, probs_per_label, words, labels)
if os.path.isdir(input_path):
    # Directory input: evaluate accuracy over the whole dataset.
    print('Loading dataset...')
    test_target, test_data = helpers.load_dataset(input_path)
    print('Testing dataset...')
    print("============= RESULT ===============")
    accuracy = helpers.get_accuracy(test_data, test_target, predictor)
    print('Accuracy: {0:.2f}%'.format(accuracy * 100))
    print('=====================================')
else:
    # Single-file input: predict its label and show it beside the expected one.
    print('Testing...')
    label, tokens = helpers.load_file(input_path)
    predict_label = predictor(tokens)
    print('================= RESULT ================')
    print('Expected label for the text: {}'.format(label))
    print('Predicted label for the text: {}'.format(predict_label))
    print('==========================================')
def download_image(url, file_path):
    """Fetch *url* to *file_path*; log success to stdout, errors to stderr.

    NOTE(review): this chunk arrived truncated mid-function; the `def` line
    and `try:` are reconstructed from the visible body and the call site at
    the bottom of this script — confirm against the full file.
    """
    try:
        urllib.request.urlretrieve(url, file_path)
        print('+ %s' % url)
    except IOError as e:
        print('%s - %s' % (url, e), file=sys.stderr)


def url_to_file_name(url):
    """Hash *url* into a stable file name, keeping short (<=4 char) extensions.

    Returns None for a falsy url (propagated through the .map below).
    """
    if url:
        file_name = sha256(url.encode('utf-8')).hexdigest()
        extension = url.split('.')[-1]
        if len(extension) > 4:
            return file_name
        else:
            return '%s.%s' % (file_name, extension)


if not os.path.exists(PATH):
    os.mkdir(PATH)

dataset = load_dataset(CSV_PATH)
# NOTE(review): np.str was removed in NumPy 1.24 — builtin `str` is the
# modern equivalent; left unchanged pending a NumPy version check.
dataset[URL_COLUMN] = dataset[URL_COLUMN].astype(np.str).replace({'nan': None})
dataset['file_names'] = dataset[URL_COLUMN].map(url_to_file_name)
# Skip rows already on disk or without a URL.
already_downloaded = dataset['file_names'].isin(os.listdir(PATH))
without_url = dataset[URL_COLUMN].isnull()
remaining_images = dataset[~(already_downloaded | without_url)]
print('Remaining: %i' % len(remaining_images))
for index, values in remaining_images.iterrows():
    url = dict(values)[URL_COLUMN]
    file_path = '%s/%s' % (PATH, url_to_file_name(url))
    time.sleep(1)  # rate-limit the remote host
    download_image(url, file_path)
import flickrapi
from helpers import flickr_api, load_dataset
import pandas as pd
import sys

# Input: Flickr photo ids; output: CSV mapping photo_id -> largest image URL.
ids_dataset_path = 'datasetFlickrID.txt'
urls_dataset_path = 'psed_images.csv'
file_headers = ['photo_id', 'url']

flickr = flickr_api('config.ini')
ids_dataset = load_dataset(ids_dataset_path, ['photo_id'])
urls_dataset = load_dataset(urls_dataset_path, file_headers)
# Only query photos that have not been resolved on a previous run.
remaining_images = ids_dataset[~ids_dataset['photo_id'].isin(urls_dataset['photo_id'])]
for index, photo in remaining_images.iterrows():
    url = None
    photo_id = str(photo['photo_id'])
    try:
        available_sizes = flickr.photos.getSizes(photo_id=photo_id)
        # The last listed size is the largest available.
        url = available_sizes['sizes']['size'][-1]['source']
        print('+ %s' % photo_id)
    except flickrapi.exceptions.FlickrError as e:
        print('%s - %s' % (photo_id, e), file=sys.stderr)
    # Failed lookups are recorded with url=None so they are not retried.
    # NOTE(review): DataFrame.append was removed in pandas 2.0 — pd.concat
    # is the modern equivalent; left unchanged pending a pandas version check.
    row = pd.Series([photo_id, url], index=file_headers)
    urls_dataset = urls_dataset.append(row, ignore_index=True)
urls_dataset.to_csv(urls_dataset_path, encoding='utf-8', index=False)
def main(model='mlp', num_epochs=500):
    """Train a classifier with Lasagne/Theano and report val/test metrics.

    Args:
        model: 'mlp', 'custom_mlp:DEPTH,WIDTH,DROP_IN,DROP_HID', or 'cnn'.
        num_epochs: number of full passes over the training set.

    Saves the trained weights to model.npz and the pickled network to
    model.dpkl.
    """
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        # Format: custom_mlp:DEPTH,WIDTH,DROP_IN,DROP_HID
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    # (NOTE(review): test_acc is rebound as a plain accumulator further down —
    # the symbolic expression is consumed by val_fn before that happens.)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    with open('model.dpkl', 'wb') as p_output:
        pickle.dump(network, p_output)
def is_direct_link(url):
    """Return truthy when *url* points straight at an image.

    Matches a .gif/.jpg/.png suffix, or one of those extensions followed by a
    query string anywhere in the URL (case-insensitive).
    """
    extensions = ['.gif', '.jpg', '.png']
    # e.g. matches "...image.jpg?width=640" style direct links.
    regexp = '|'.join([r'(?:\%s\?[\w=&;]+)' % ext for ext in extensions])
    return url[-4:].lower() in extensions or \
        re.search(regexp, url, flags=re.IGNORECASE)


def extract_image_url(url):
    """Resolve *url* to a direct image URL.

    Direct links pass through; otherwise the imgur/gfycat/flickr resolvers
    are tried in order. Returns None when none of them matches.
    """
    if is_direct_link(url):
        return url
    else:
        return imgur_url(url) or \
            gfycat_url(url) or \
            flickr_url(url)


flickr = flickr_api('config.ini')
reddit_dataset = load_dataset(CSV_PATH)
# Drop self-posts: they have no linked media.
reddit_dataset = reddit_dataset[~reddit_dataset['is_self']]
reddit_dataset['image_url'] = reddit_dataset['url'].map(extract_image_url)
skipped_rows = reddit_dataset[reddit_dataset['image_url'].isnull()]['url']
print('--- Skipping %i rows' % len(skipped_rows))
if len(skipped_rows):
    print(skipped_rows)

urls_dataset_path = 'reddit_psed_images.csv'
file_headers = ['csv', 'url']
urls_dataset = load_dataset(urls_dataset_path, file_headers)
# Keep only resolved URLs that are not already stored.
new_images = ~(reddit_dataset['image_url'].isin(urls_dataset['url']) | \
               reddit_dataset['image_url'].isnull())
urls_to_include = pd.DataFrame()
urls_to_include[['permalink', 'url']] = reddit_dataset[new_images][['permalink', 'image_url']]