Example #1
def processing():
    keyword_list = helpers.load_dataset(ds.output_data +
                                        "keywords/keywords_single_list.csv")
    store = {}
    keyword_list = list_creator(keyword_list)
    file_paths = []

    for df in ds.all_datasets:
        print("    - Processing", df)
        f_name = df
        store[f_name] = {}
        df = helpers.load_dataset(ds.dataset + df)
        df = df[df.tweet_language == "en"]
        for index, row in df.iterrows():
            matches = check_keyword(clean_tweet(row.tweet_text), keyword_list)
            if len(matches) != 0:
                store[f_name][row.tweetid] = matches
    # storage: write the collected matches to per-dataset output files
    matches_counter = 0
    for f_name in store:
        data_list = []
        filename = f_name.split("/")
        dataset = filename[0]
        filename = filename[1]
        path = ds.output_data + "individual_keyword_matches/"
        dataset_path = path + dataset + "/"
        helpers.path_checker(dataset_path)
        file_path = dataset_path + filename
        for item in store[f_name]:
            data_list.append([item, store[f_name][item]])
            matches_counter += 1
        helpers.data_to_file_two_values(data_list, '"tweet_id","matches"',
                                        file_path)
        file_paths.append(file_path)
    return file_paths
Example #2
def get_undetected_red(dataset='test'):
    if dataset == 'test':
        raw_images = helpers.load_dataset(IMAGE_DIR_TEST)
    elif dataset == 'training':
        raw_images = helpers.load_dataset(IMAGE_DIR_TRAINING)
    images = standardize(raw_images)
    und_red = []

    # Iterate through all the test images
    # Classify each image and compare to the true label
    for i, image in enumerate(images):
        # Get true data
        im = image[0]
        true_label = image[1]
        predicted_label = estimate_label(im)
        if predicted_label != true_label:
            # Misclassified: record red lights that were predicted as green
            if predicted_label == [0, 0, 1] and true_label == [1, 0, 0]:
                und_red.append((im, predicted_label, true_label, i))

    # Summary: report how many red lights went undetected out of all images
    total = len(images)

    print("Number of undetected red in ", dataset, " = ", len(und_red),
          ' out of ', total)
    for i in und_red:
        print("\tImage num ", i[3], " in dataset ", dataset,
              " false detected ", i[1])

    return und_red
Example #3
def load_data():
    """Загрзка данных тренироваочные, валидационные, тестовые"""

    try:
        IMAGE_DIR_TRAIN = "data/training"
        TRAIN_IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_TRAIN)
        TRAIN_OBJECTS_LIST = helpers.load_objects(TRAIN_IMAGE_LIST,
                                                  IMAGE_DIR_TRAIN)
    except Exception:
        TRAIN_OBJECTS_LIST = []

    try:
        IMAGE_DIR_TEST = "data/test"
        TEST_IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_TEST)
        TEST_OBJECTS_LIST = helpers.load_objects(TEST_IMAGE_LIST,
                                                 IMAGE_DIR_TEST)
    except Exception:
        TEST_OBJECTS_LIST = []

    try:
        IMAGE_DIR_VAL = "data/val"
        VAL_IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_VAL)
        VAL_OBJECTS_LIST = helpers.load_objects(VAL_IMAGE_LIST, IMAGE_DIR_VAL)
    except Exception:
        VAL_OBJECTS_LIST = []

    return TRAIN_OBJECTS_LIST, TEST_OBJECTS_LIST, VAL_OBJECTS_LIST
def negation_handled(folder, n_grams):
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    results_files = get_results_filenames(folder)
    for results_file in results_files:
        mpt = results_file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + results_file)
        results_df = get_first_experimental_results(data, mpt, results_df, n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_file)
def execute(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.file)
    for results_file in results_files:
        emotion = results_file.split("_")[0]
        if emotion == "best":
            continue
        results_df = helpers.load_dataset(folder + results_file)
        results_df = get_first_experimental_results(data, emotion, results_df,
                                                    n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_file)
def execute(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.file)
    experiments_df = helpers.load_dataset(
        "/home/michael/MRes/actual_project/emotion_detection/" + n_grams +
        "/next_experiments.csv")
    for results_filename in results_files:
        emotion = results_filename.split("_")[0]
        if emotion == "best":
            continue
        experiments = experiments_df[experiments_df.emotion == emotion]
        results_df = helpers.load_dataset(folder + results_filename)
        results_df = process_experiments(data, emotion, experiments,
                                         results_df, n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_filename)
    return
Example #7
def tst_estimate_label(im_num, dataset='test'):
    if dataset == 'test':
        raw_images = helpers.load_dataset(IMAGE_DIR_TEST)
    elif dataset == 'training':
        raw_images = helpers.load_dataset(IMAGE_DIR_TRAINING)
    images = standardize(raw_images)

    image = images[im_num]
    im = image[0]
    true_label = image[1]
    predicted_label = estimate_label(im)

    print(predicted_label)
def process_negation_handled_experiments(folder, n_grams):
    results_files = get_results_filenames(folder)
    data = helpers.load_dataset(ds.dataset + ds.negate_dataset)
    experiments_df = helpers.load_dataset(
        "/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams +
        "/next_negation_handled_experiments.csv")
    for results_filename in results_files:
        mpt = results_filename.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        experiments = experiments_df[experiments_df.mpt == mpt]
        results_df = helpers.load_dataset(folder + results_filename)
        results_df = process_experiments(data, mpt, experiments, results_df,
                                         n_grams)
        helpers.dataframe_to_csv(results_df, folder + results_filename)
    return
Example #9
def get_existing_results(folder, dataset_type, n_grams):
    experiments = []
    for file in get_results_filenames(folder):
        mpt = file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(
            ['weighted_avg_f1-score'],
            ascending=False).groupby('algorithm').head(3)
        results_df = results_df.reset_index(drop=True)
        algorithms = algorithm_single_list(results_df.algorithm.tolist())
        for algorithm in algorithms:
            relevant_rows = results_df[results_df.algorithm == algorithm]
            for index, row in relevant_rows.iterrows():
                experiments = next_experiments(mpt, algorithm,
                                               row.hyperparameter, experiments)
                break
    new_experiments_df = pd.DataFrame(
        experiments, columns=["mpt", "algorithm", "hyperparameter"])
    helpers.dataframe_to_csv(
        new_experiments_df,
        "/home/michael/MRes/actual_project/sentiment_analysis/" + n_grams +
        "/next_" + dataset_type + "_experiments.csv")
def load_tagged_keywords():
    tagged_keywords = helpers.load_dataset(ds.output_data +
                                           "keywords/keywords_tagged.csv")
    tagged_keywords["split"] = ""
    for index, row in tagged_keywords.iterrows():
        tagged_keywords.split.at[index] = keyword_splitter_to_list(row.term)
    return tagged_keywords
    def __init__(self, experiment_name, vizualize, num_epochs, n_observations):
        # Create Experiment name dir for records
        self.experiment_name = experiment_name
        self.n_observations = n_observations

        self.viz = create_viz('{}_{}'.format(
            name_env, self.experiment_name)) if vizualize else None

        self.dataset, self.dataloader, self.device = load_dataset(
            self.viz, folder_name=self.experiment_name)

        self.netG = Generator(ngpu).to(self.device)
        self.netD = Discriminator(ngpu).to(self.device)

        self.start_epoch = self.filehandling_experiment()
        self.num_epochs = num_epochs

        # We create a fixed batch of random noise for the latent variable,
        # so we can evaluate our progress on the same inputs every epoch.
        self.fixed_noise = torch.randn(64, nz, 1, 1, device=self.device)

        # Set up RMSprop optimizers for both G and D
        self.optimizerD = optim.RMSprop(self.netD.parameters(), lr=lr)
        self.optimizerG = optim.RMSprop(self.netG.parameters(), lr=lr)
    def __init__(self):
        self.dataset, self.dataloader, self.device = load_dataset()

        self.netG = Generator(ngpu).to(self.device)
        self.netD = Discriminator(ngpu).to(self.device)

        # Initialise Weights
        self.netG.apply(weights_init)
        self.netD.apply(weights_init)

        # define loss function
        self.criterion = nn.BCELoss()

        # We create a fixed batch of random noise for the latent variable,
        # so we can evaluate our progress on the same inputs every epoch.
        self.fixed_noise = torch.randn(64, nz, 1, 1, device=self.device)

        # Establish convention for real and fake labels during training
        self.real_label = 1
        self.fake_label = 0

        # Setup Adam optimizers for both G and D
        self.optimizerD = optim.Adam(self.netD.parameters(),
                                     lr=lr,
                                     betas=(beta1, 0.999))
        self.optimizerG = optim.Adam(self.netG.parameters(),
                                     lr=lr,
                                     betas=(beta1, 0.999))

def import_best_results_and_sort(folder):
    best_results_df = helpers.load_dataset(folder +
                                           "best_result_per_emotion.csv")
    best_results_df = best_results_df.sort_values(['macro_avg_f1-score'],
                                                  ascending=False)
    best_results_df = best_results_df.reset_index(drop=True)
    helpers.dataframe_to_csv(best_results_df,
                             folder + "best_result_per_emotion_sorted.csv")
Example #14
def tweet_extractor():
    files_created_generic = []
    files_created_specific = []
    for file in ds.all_datasets:
        generic_df = helpers.load_dataset(ds.output_data +
                                          "actual_keyword_matches/generic/" +
                                          file)
        specific_df = helpers.load_dataset(ds.output_data +
                                           "actual_keyword_matches/specific/" +
                                           file)
        print("      - loading data", file)
        df = helpers.load_dataset(ds.dataset + file)
        df = df[df.tweet_language == "en"]
        # Output columns: the original tweet columns plus match metadata
        columns = list(df.columns)
        columns.extend(["matches", "source_file", "month", "year"])
        df["matches"] = ""
        df["source_file"] = ""
        df["tweet_time"] = df["tweet_time"].astype("datetime64")
        df["month"] = df["tweet_time"].dt.month
        df["year"] = df["tweet_time"].dt.year
        specific_tweets, generic_tweets = pd.DataFrame(
            columns=columns), pd.DataFrame(columns=columns)
        specific_tweets = match_extractor(specific_df, df, specific_tweets,
                                          file, "specific")
        generic_tweets = match_extractor(generic_df, df, generic_tweets, file,
                                         "generic")
        output_data_path = ds.output_data + "first_dataset_extraction/"
        dataset = file.split("/")[0]
        filename = file.split("/")[1]

        specific_path = output_data_path + "specific/" + dataset + "/"
        helpers.path_checker(specific_path)
        helpers.dataframe_to_csv(specific_tweets, specific_path + filename)
        files_created_specific.append(specific_path + filename)

        generic_path = output_data_path + "generic/" + dataset + "/"
        helpers.path_checker(generic_path)
        helpers.dataframe_to_csv(generic_tweets, generic_path + filename)
        files_created_generic.append(generic_path + filename)
    return files_created_generic, files_created_specific
Example #15
def run(x_key,
        y_key,
        start_from: int = 0,
        window: int = 14,
        epochs: int = 100,
        optimizer="adam"):
    x_data, x_mean, x_std = normalize_dataset(load_dataset(x_key)[start_from:])
    y_data, y_mean, y_std = normalize_dataset(load_dataset(y_key)[start_from:])

    x, y1, y2 = create_dataset_multi_step(x_data, y_data, window)

    if isinstance(optimizer, keras.optimizers.Optimizer):
        optimizer_name = optimizer.get_config()["name"]
    else:
        optimizer_name = optimizer

    filename = f"{x_key}_{y_key}_{window}_{optimizer_name}"
    checkpoint = keras.callbacks.ModelCheckpoint(
        f"weights_{filename}.hdf5",
        monitor="loss",
        verbose=True,
        save_best_only=True,
    )
    callback = keras.callbacks.EarlyStopping(patience=20,
                                             monitor="loss",
                                             verbose=True,
                                             restore_best_weights=True)

    model = create_multi_model(x.shape[-1],
                               y2.shape[-1],
                               optimizer,
                               name=filename)
    history = model.fit(x=x,
                        y=[y1, y2],
                        epochs=epochs,
                        batch_size=x.shape[0],
                        callbacks=[callback, checkpoint])
    plot_training_history_with_validation(history, filename)

    _, y_pred = predict_multi_values(model, x_data, y_data, 28)

    plot_predicted(y_pred * y_std + y_mean, START_DATE.shift(days=start_from),
                   f"graph_{filename}.png")
def load_data():
    """Build the training, test, and validation image arrays.

    The helper module helpers.py builds an image array from a given path.

    Returns:
    IMAGE_LIST - array of training images
    TEST_IMAGE_LIST - array of test images
    VALIDATION_IMAGE_LIST - array of validation images (the algorithm is
    evaluated against this array)
    """

    IMAGE_DIR_TRAINING = "data/training/"
    IMAGE_DIR_TEST = "data/test/"
    IMAGE_DIR_VALIDATION = "data/val/"
    IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_TRAINING)
    TEST_IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_TEST)
    VALIDATION_IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_VALIDATION)

    return IMAGE_LIST, TEST_IMAGE_LIST, VALIDATION_IMAGE_LIST
def single_list_generator():
    df = helpers.load_dataset("original_keywords.csv")
    list_of_terms = df.keywords.tolist()
    individual_terms = word_extractor(list_of_terms)
    list_df = pd.DataFrame(individual_terms, columns=["keyword"])
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_single_list.csv"
    helpers.dataframe_to_csv(list_df, output_file)
    return output_file
Example #18
def run(x_key,
        y_key,
        start_from: int = 0,
        window: int = 14,
        validation_split=.0):
    x_data = load_dataset(x_key)[start_from:]
    y_data = load_dataset(y_key)[start_from:]
    x_data, x_mean, x_std = normalize_dataset(x_data)
    y_data, y_mean, y_std = normalize_dataset(y_data)

    x, y = create_simple_dataset(x_data, y_data, window)

    checkpoint = keras.callbacks.ModelCheckpoint(
        f"{x_key}_{y_key}_{window}_simple.hdf5",
        monitor="loss",
        verbose=True,
        save_best_only=True,
    )
    callback = keras.callbacks.EarlyStopping(patience=20,
                                             monitor="loss",
                                             verbose=True,
                                             restore_best_weights=True)

    model = create_simple_model(x.shape[-1], y.shape[-1])

    history = model.fit(
        x=x,
        y=y,
        validation_split=validation_split,
        epochs=100,
        batch_size=x.shape[0],
        callbacks=[callback],
    )

    plot_training_history_with_validation(
        history, with_validation=bool(validation_split))

    if x.shape[-1] == y.shape[-1]:

        y_pred = predict_values(model, x_data, 28)
        plot_predicted(y_pred * y_std + y_mean,
                       START_DATE.shift(days=start_from),
                       f"{x_key}_{y_key}_{window}_simple.png")
def tagged_keywords_generator():
    df = helpers.load_dataset(ds.output_data +
                              "keywords/original_keywords.csv")
    list_of_terms = []
    for item in df.iterrows():
        list_of_terms.append(item[1][0])
    tagged_terms = term_tagger(list_of_terms)
    output_dir = ds.output_data + "keywords/"
    helpers.path_checker(output_dir)
    output_file = output_dir + "keywords_tagged.csv"
    helpers.data_to_file_two_values(tagged_terms, '"term","tag"', output_file)
    return output_file
def import_results(folder):
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        mpt = file.split("_")[0]
        if mpt == "best":
            continue
        mpt = int(mpt)
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['weighted_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        for index, row in results_df.iterrows():
            new_results.append([mpt,
                                row.algorithm,
                                row.hyperparameter,
                                row.weighted_avg_precision,
                                row.weighted_avg_recall,
                                row["weighted_avg_f1-score"],
                                row.accuracy,
                                row.experiment_type,
                                row.metric_dump_id,
                                row.positive_precision,
                                row.positive_recall,
                                row["positive_f1-score"],
                                row.neutral_precision,
                                row.neutral_recall,
                                row["neutral_f1-score"],
                                row.negative_precision,
                                row.negative_recall,
                                row["negative_f1-score"]])
            break
    columns = ["mpt",
            "algorithm",
            "hyperparameter",
            "weighted_avg_precision",
            "weighted_avg_recall",
            "weighted_avg_f1-score",
            "accuracy",
            "experiment_type",
            "metric_dump_id",
            "positive_precision",
            "positive_recall",
            "positive_f1-score",
            "neutral_precision",
            "neutral_recall",
            "neutral_f1-score",
            "negative_precision",
            "negative_recall",
            "negative_f1-score"]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df, folder + "best_result_per_mpt.csv")
Example #21
def tst_brightly_colored_pixels(img_number, dataset):
    if dataset == 'test':
        raw_images = helpers.load_dataset(IMAGE_DIR_TEST)
    elif dataset == 'training':
        raw_images = helpers.load_dataset(IMAGE_DIR_TRAINING)
    images = standardize(raw_images)

    im = images[img_number][0]
    (top_img, mid_img, bottom_img) = split_image_horizontally(im)

    (masked_red, total_nonnull_red,
     total_amount_red) = mask_for_bright_pixels(top_img, 'red', True)
    (masked_yellow, total_nonnull_yellow,
     total_amount_yellow) = mask_for_bright_pixels(mid_img, 'yellow', True)
    (masked_green, total_nonnull_green,
     total_amount_green) = mask_for_bright_pixels(bottom_img, 'green', True)

    color = create_brightly_colored_pixels_feature(im, [0, 0, 0], True)
    f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 10))
    ax1.imshow(im)
    ax2.imshow(masked_red)
    ax3.imshow(masked_yellow)
    ax4.imshow(masked_green)
    print("Detected by brigth pixels: ", color)
Example #22
def merge(dataset_type):
    print("      - Processing " + dataset_type + " files:")
    for file in ds.all_datasets:
        print("        - " + file)
        file_path = file.split("/")
        f_name = (ds.output_data + "first_dataset_extraction/" + dataset_type +
                  "/" + file_path[0] + "/" + file_path[1])
        df = helpers.load_dataset(f_name)
        if file == ds.all_datasets[0]:
            merge_hold = df
        else:
            merge_hold = pd.concat([merge_hold, df], sort=False)
    output_path = ds.output_data + "merged_dataset_extraction/"
    helpers.path_checker(output_path)
    file_name = dataset_type + ".csv"
    helpers.dataframe_to_csv(merge_hold, output_path + file_name)
    return output_path + file_name
def prepare_timestamp():
    dataset_timestamp = load_dataset(
        'study_data_windowed/study_data_windowed_muse_30_s.gzip.pkl')

    # prepare data - remove unimportant attributes
    unused_keys = [
        'Delta_TP9', 'Delta_AF7', 'Delta_AF8', 'Delta_TP10',
        'Theta_TP9', 'Theta_AF7', 'Theta_AF8', 'Theta_TP10',
        'Alpha_TP9', 'Alpha_AF7', 'Alpha_AF8', 'Alpha_TP10',
        'Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Beta_TP10',
        'Gamma_TP9', 'Gamma_AF7', 'Gamma_AF8', 'Gamma_TP10',
        'RAW_TP9', 'RAW_TP10', 'RAW_AF7', 'RAW_AF8',
        'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
        'AUX_RIGHT', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
        'HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8',
        'HeadBandOn', 'Battery',
    ]
    for n in range(22, 65):
        for m in range(0, 11):
            for key in unused_keys:
                del dataset_timestamp[n]['data'][m][key]
    return dataset_timestamp
Example #24
def dataset_processing():
    global score_counter
    global df
    df = helpers.load_dataset(ds.dataset)
    df['sentiment_class'] = ""
    df['positive_score'] = ""
    df['negative_score'] = ""
    df['objective_score'] = ""
    df['stemmed_preprocessed_text'] = ""
    df['words_matched_percentage'] = ""
    word_dict = {}
    pos_dict = {}
    for index, row in df.iterrows():
        if index % 100 == 0:
            print("    -", str(index), "/", str(len(df)))
        stemmed_preprocessed_text = []
        synsets = []
        tweet_text = str(row.preprocessed_tweet_text)
        tweet_text = word_tokenize(tweet_text)
        words_with_pos = pos_tag(tweet_text)
        for word, pos in words_with_pos:
            word_synset = synset_matching(word, pos_tag_conversion(pos),
                                          tweet_text, row.tweet_text)
            if word_synset is not None:
                synsets.append(word_synset)
            stemmed_preprocessed_text.append(stemming(word))
        if len(synsets) > 0:
            sent_class, pos_score, neg_score, obj_score = sentiwordnet_processing(
                synsets)
            df.sentiment_class.at[index] = sent_class
            df.positive_score.at[index] = pos_score
            df.negative_score.at[index] = neg_score
            df.objective_score.at[index] = obj_score
        stemmed_preprocessed_text = " ".join(stemmed_preprocessed_text)
        df.stemmed_preprocessed_text.at[index] = stemmed_preprocessed_text
        if len(tweet_text) != 0:
            df.words_matched_percentage.at[index] = round(
                100 * len(synsets) / len(tweet_text), 2)
        else:
            df.words_matched_percentage.at[index] = 0

    for ix in score_counter:
        print(ix, score_counter[ix])
    return df
Example #25
def date_selection():
    output_files = []
    path = ds.output_data + "merged_dataset_extraction/"
    files = helpers.path_fetcher(path)
    for file in files:
        df = helpers.load_dataset(path + file)
        df_2013 = df[df.year == 2013]
        df_2013_8 = df_2013[df_2013.month == 8]
        df_2013_9 = df_2013[df_2013.month == 9]
        df_2013_10 = df_2013[df_2013.month == 10]
        df_2013_11 = df_2013[df_2013.month == 11]
        df_2013_12 = df_2013[df_2013.month == 12]
        df = df[(df.year == 2014) | (df.year == 2015) | (df.year == 2016) |
                (df.year == 2017) | (df.year == 2018)]
        df = pd.concat(
            [df_2013_8, df_2013_9, df_2013_10, df_2013_11, df_2013_12, df])
        storage_path = ds.output_data + "time_filtered_dataset_extraction/"
        helpers.path_checker(storage_path)
        helpers.dataframe_to_csv(df, storage_path + file)
        output_files.append(storage_path + file)
    return output_files
def import_results(folder):
    new_results = []
    files = get_results_filenames(folder)
    for file in files:
        print("---" + file + "---")
        emotion = file.split("_")[0]
        if emotion == "best":
            continue
        results_df = helpers.load_dataset(folder + file)
        results_df = results_df.sort_values(['macro_avg_f1-score'],
                                            ascending=False)
        results_df = results_df.reset_index(drop=True)
        helpers.dataframe_to_csv(results_df, folder + file)
        for index, row in results_df.iterrows():
            new_results.append([
                emotion, row.algorithm, row.hyperparameter,
                row.weighted_avg_precision, row.weighted_avg_recall,
                row["weighted_avg_f1-score"], row.accuracy,
                row.experiment_type, row.metric_dump_id,
                row.macro_avg_precision, row.macro_avg_recall,
                row["macro_avg_f1-score"], row[emotion + "_precision"],
                row[emotion + "_recall"], row[emotion + "_f1-score"],
                row["no_" + emotion + "_precision"],
                row["no_" + emotion + "_recall"],
                row["no_" + emotion + "_f1-score"]
            ])
            break
    columns = [
        "emotion", "algorithm", "hyperparameter", "weighted_avg_precision",
        "weighted_avg_recall", "weighted_avg_f1-score", "accuracy",
        "experiment_type", "metric_dump_id", "macro_avg_precision",
        "macro_avg_recall", "macro_avg_f1-score", emotion + "_precision",
        emotion + "_recall", emotion + "_f1-score",
        "no_" + emotion + "_precision", "no_" + emotion + "_recall",
        "no_" + emotion + "_f1-score"
    ]
    new_results_df = pd.DataFrame(new_results, columns=columns)
    helpers.dataframe_to_csv(new_results_df,
                             folder + "best_result_per_emotion.csv")
def prepare_data():
    dataset_raw = load_dataset('data.gzip.pkl')

    # remove unused attributes from every record
    unused_keys = [
        'time_stamp',
        'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z',
        'AUX_RIGHT', 'Gyro_X', 'Gyro_Y', 'Gyro_Z',
        'RAW_TP9', 'RAW_TP10', 'RAW_AF7', 'RAW_AF8',
        'HSI_TP9', 'HSI_TP10', 'HSI_AF7', 'HSI_AF8',
        'HeadBandOn', 'Battery',
    ]
    for n in range(22, 65):
        for m in range(0, 11):
            for key in unused_keys:
                del dataset_raw[n]['data'][m][key]
    return dataset_raw
import cv2  # computer vision library
import helpers
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Image data directories
image_dir_training = "day_night_images/training/"
image_dir_test = "day_night_images/test/"

# Using the load_dataset function in helpers.py
# Load training data
IMAGE_LIST = helpers.load_dataset(image_dir_training)

# Standardize all training images
STANDARDIZED_LIST = helpers.standardize(IMAGE_LIST)

# Display a standardized image and its label

# Select an image by index
image_num = 0
selected_image = STANDARDIZED_LIST[image_num][0]
selected_label = STANDARDIZED_LIST[image_num][1]

# Display image and data about it
# plt.imshow(selected_image)
# print("Shape: "+str(selected_image.shape))
# print("Label [1 = day, 0 = night]: " + str(selected_label))


# Find the average Value or brightness of an image
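
# The helper that computes this is not shown in the snippet; below is a minimal
# sketch (an assumption, not necessarily the project's actual implementation)
# that converts the RGB image to HSV and averages the V channel, reusing the
# cv2 and np imports from the top of this file:
def avg_brightness(rgb_image):
    # Convert to HSV and average the Value (brightness) channel over all pixels
    hsv = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV)
    return np.sum(hsv[:, :, 2]) / (hsv.shape[0] * hsv.shape[1])

# Example: avg_brightness(STANDARDIZED_LIST[0][0])
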
# ## Load the datasets
#
# These first few lines of code will load the training traffic light images and store all of them in a variable, `IMAGE_LIST`. This list contains the images and their associated label ("red", "yellow", "green").
#
# You are encouraged to take a look at the `load_dataset` function in the helpers.py file. This will give you a good idea about how lots of image files can be read in from a directory using the [glob library](https://pymotw.com/2/glob/). The `load_dataset` function takes in the name of an image directory and returns a list of images and their associated labels.
#
# For example, the first image-label pair in `IMAGE_LIST` can be accessed by index:
# ``` IMAGE_LIST[0][:]```.
#

# In[3]:

# Using the load_dataset function in helpers.py
# Load training data
IMAGE_LIST = helpers.load_dataset(IMAGE_DIR_TRAINING)
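
# As a quick check of what load_dataset returns (assuming, as described above,
# that each entry is an (image, label) pair), the first pair can be unpacked
# and inspected directly; the variable names here are only illustrative:
example_image, example_label = IMAGE_LIST[0]
print("Example image shape:", example_image.shape)
print("Example label:", example_label)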

# ## Visualize the Data
#
# The first steps in analyzing any dataset are to 1. load the data and 2. look at the data. Seeing what it looks like will give you an idea of what to look for in the images, what kind of noise or inconsistencies you have to deal with, and so on. This will help you understand the image dataset, and **understanding a dataset is part of making predictions about the data**.

# ---
# ### Visualize the input images
#
# Visualize and explore the image data! Write code to display an image in `IMAGE_LIST`:
# * Display the image
# * Print out the shape of the image
# * Print out its corresponding label
#
# See if you can display at least one of each type of traffic light image – red, green, and yellow — and look at their similarities and differences.
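
# A minimal sketch of the visualization described above (hypothetical helper
# name; it assumes IMAGE_LIST entries are (image, label) pairs with the string
# labels "red", "yellow", and "green"):
import matplotlib.pyplot as plt

def show_one_per_label(image_list, labels=("red", "yellow", "green")):
    fig, axes = plt.subplots(1, len(labels), figsize=(12, 4))
    for ax, wanted in zip(axes, labels):
        # Pick the first image whose label matches the wanted class
        image, label = next((im, lb) for im, lb in image_list if lb == wanted)
        ax.imshow(image)
        ax.set_title("{} {}".format(label, image.shape))
        print("Label:", label, "shape:", image.shape)
    plt.show()

# show_one_per_label(IMAGE_LIST)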
Example #30
import os
import pickle
from functools import partial

import helpers

print('Getting model...')
result = None
with open(os.path.join(model_path, 'train.pickle'), 'rb') as f:
    result = pickle.load(f)

label_probs = result[0]
probs_per_label = result[1]
words = result[2]
labels = result[3]

predictor = partial(helpers.predict,
                    label_probs, probs_per_label, words, labels)

if os.path.isdir(input_path):
    print('Loading dataset...')
    test_target, test_data = helpers.load_dataset(input_path)

    print('Testing dataset...')
    print("============= RESULT ===============")
    accuracy = helpers.get_accuracy(test_data, test_target, predictor)
    print('Accuracy: {0:.2f}%'.format(accuracy * 100))
    print('=====================================')
else:
    print('Testing...')
    label, tokens = helpers.load_file(input_path)
    predict_label = predictor(tokens)
    print('================= RESULT ================')
    print('Expected label for the text: {}'.format(label))
    print('Predicted label for the text: {}'.format(predict_label))
    print('==========================================')
import os
import sys
import time
import urllib.request
from hashlib import sha256

def download_image(url, file_path):
    try:
        urllib.request.urlretrieve(url, file_path)
        print('+ %s' % url)
    except IOError as e:
        print('%s - %s' % (url, e), file=sys.stderr)

def url_to_file_name(url):
    if url:
        file_name = sha256(url.encode('utf-8')).hexdigest()
        extension = url.split('.')[-1]
        if len(extension) > 4:
            return file_name
        else:
            return '%s.%s' % (file_name, extension)

if not os.path.exists(PATH):
    os.mkdir(PATH)

dataset = load_dataset(CSV_PATH)
dataset[URL_COLUMN] = dataset[URL_COLUMN].astype(str).replace({'nan': None})
dataset['file_names'] = dataset[URL_COLUMN].map(url_to_file_name)
already_downloaded = dataset['file_names'].isin(os.listdir(PATH))
without_url = dataset[URL_COLUMN].isnull()
remaining_images = dataset[~(already_downloaded | without_url)]

print('Remaining: %i' % len(remaining_images))
for index, values in remaining_images.iterrows():
    url = dict(values)[URL_COLUMN]
    file_path = '%s/%s' % (PATH, url_to_file_name(url))
    time.sleep(1)
    download_image(url, file_path)
import flickrapi
from helpers import flickr_api, load_dataset
import pandas as pd
import sys

ids_dataset_path = 'datasetFlickrID.txt'
urls_dataset_path = 'psed_images.csv'
file_headers = ['photo_id', 'url']
flickr = flickr_api('config.ini')
ids_dataset = load_dataset(ids_dataset_path, ['photo_id'])
urls_dataset = load_dataset(urls_dataset_path, file_headers)
remaining_images = ids_dataset[~ids_dataset['photo_id'].isin(urls_dataset['photo_id'])]

for index, photo in remaining_images.iterrows():
    url = None
    photo_id = str(photo['photo_id'])
    try:
        available_sizes = flickr.photos.getSizes(photo_id=photo_id)
        url = available_sizes['sizes']['size'][-1]['source']
        print('+ %s' % photo_id)
    except flickrapi.exceptions.FlickrError as e:
        print('%s - %s' % (photo_id, e), file=sys.stderr)

    row = pd.DataFrame([[photo_id, url]], columns=file_headers)
    urls_dataset = pd.concat([urls_dataset, row], ignore_index=True)
    urls_dataset.to_csv(urls_dataset_path, encoding='utf-8', index=False)
Example #33
def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))

    with open('model.dpkl', 'wb') as p_output:
        pickle.dump(network, p_output)
def is_direct_link(url):
    extensions = ['.gif', '.jpg', '.png']
    regexp = '|'.join([r'(?:\%s\?[\w=&;]+)' % ext for ext in extensions])
    return url[-4:].lower() in extensions or \
        re.search(regexp, url, flags=re.IGNORECASE)

def extract_image_url(url):
    if is_direct_link(url):
        return url
    else:
        return imgur_url(url) or \
            gfycat_url(url) or \
            flickr_url(url)

flickr = flickr_api('config.ini')
reddit_dataset = load_dataset(CSV_PATH)
reddit_dataset = reddit_dataset[~reddit_dataset['is_self']]
reddit_dataset['image_url'] = reddit_dataset['url'].map(extract_image_url)

skipped_rows = reddit_dataset[reddit_dataset['image_url'].isnull()]['url']
print('--- Skipping %i rows' % len(skipped_rows))
if len(skipped_rows):
    print(skipped_rows)

urls_dataset_path = 'reddit_psed_images.csv'
file_headers = ['csv', 'url']
urls_dataset = load_dataset(urls_dataset_path, file_headers)
new_images = ~(reddit_dataset['image_url'].isin(urls_dataset['url']) | \
    reddit_dataset['image_url'].isnull())
urls_to_include = pd.DataFrame()
urls_to_include[['permalink', 'url']] = \
    reddit_dataset[new_images][['permalink', 'image_url']]