def create_word_bias_data(disaster_csv, bias_file_name):
    """
    Based on the disaster data, generates a file to store the bias data for word ==> category
    :param disaster_csv: The disaster.csv file path
    :param bias_file_name: The file name of the output file with bias data
    """

    # Read data
    disaster = ut.read_csv(disaster_csv)
    disaster['message'] = disaster['message'].apply(ast.literal_eval)
    non_category_names = [
        'id', 'message', 'original', 'genre_direct', 'genre_news',
        'genre_social'
    ]
    category_names = list(
        dropwhile(lambda x: x in non_category_names, disaster.columns))

    # Record word to category frequency mapping
    bias_data = {}
    total = ut.row_count(disaster)
    for index, row in disaster.iterrows():

        for word in row['message']:

            if word not in bias_data:
                bias_data[word] = {}
                for category_name in category_names:
                    bias_data[word][category_name + '_ones'] = 0
                    bias_data[word][category_name + '_total'] = 0

            for category_name in category_names:
                bias_data[word][category_name + '_ones'] += row[category_name]
                bias_data[word][category_name + '_total'] += 1

        if index % 100 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    # Generate a data frame from the frequency mapping
    bias = pd.DataFrame()
    bias['word'] = bias_data.keys()

    # Populate each category's ones and total columns and add them to the dataframe
    columns = bias_data[next(iter(bias_data))].keys()
    for column in columns:
        bias[column] = [bias_data[word][column] for word in bias_data]

    # For each category, calculate the bias based on the ones and total data
    for category_name in category_names:
        bias[category_name + '_bias'] = (bias[category_name + '_ones'] /
                                         bias[category_name + '_total'])

    bias.to_csv(bias_file_name, index=False)
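

# Hypothetical usage sketch, not part of the original module. It shows how the
# bias file produced above might be consumed, assuming the input CSV's
# 'message' column holds stringified token lists (hence the ast.literal_eval
# above); 'water' is only an illustrative category name.
def _example_create_word_bias_data():
    create_word_bias_data('data/disaster.csv', 'data/word_bias.csv')
    word_bias = pd.read_csv('data/word_bias.csv')
    # Each word row carries '<category>_ones', '<category>_total' and
    # '<category>_bias' columns, where bias = ones / total for that word.
    print(word_bias.sort_values('water_bias', ascending=False).head())
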
def create_user_item_matrix(interactions):
    """
    Return a matrix with user ids as rows and article ids as columns, holding a 1 where a user interacted with an
    article and a 0 otherwise
    :param interactions: The interactions data
    :return: The user-item matrix
    """

    # Create df with user_id column
    print('Creating User Id column...')
    user_item_matrix = pd.DataFrame()
    email_to_id_mapping, user_id_column = get_email_to_id_mapping(interactions)
    user_item_matrix['user_id'] = email_to_id_mapping.values()

    # Create df with zeros for each article_id
    print('Creating df with zeros...')
    unique_article_ids = set(interactions['article_id'])
    article_df = pd.DataFrame(columns=unique_article_ids)
    user_id_count = ut.row_count(user_item_matrix)
    current = 1
    total = len(article_df.columns)
    for column in article_df.columns:
        article_df[column] = np.zeros(user_id_count)
        ut.update_progress(current, total)
        current += 1

    # Join both dfs
    print('Joining...')
    user_item_matrix = user_item_matrix.join(article_df)

    # Flip switch to 1 for each unique interaction
    print('Getting unique interactions...')
    unique_interactions = set(
        interactions.apply(
            lambda row: str(row['article_id']) + '--' + str(row['email']),
            axis=1))

    current = 1
    total = len(unique_interactions)
    print('Flipping switches from 0 to 1...')
    for interaction in unique_interactions:
        sections = interaction.split('--')
        article_id = float(sections[0])
        email = sections[1] if sections[1] != 'nan' else np.nan
        user_id = email_to_id_mapping[email]
        user_item_matrix.loc[user_item_matrix['user_id'] == user_id,
                             article_id] = 1
        ut.update_progress(current, total)
        current += 1

    return user_item_matrix
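

# Hypothetical usage sketch, not part of the original module. It assumes the
# interactions DataFrame has the 'article_id' and 'email' columns used above;
# the CSV path is only illustrative.
def _example_create_user_item_matrix():
    interactions = ut.read_csv('data/user-item-interactions.csv')
    user_item_matrix = create_user_item_matrix(interactions)
    # One row per user id and one column per article id (plus 'user_id'),
    # holding 1 where that user interacted with that article and 0 otherwise.
    print(user_item_matrix.shape)
    print(user_item_matrix.head())
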
def show_disaster_pca_for(category_name):
    """
    Show a PCA where the data points are the word vectors and the targets are the values in the given category
    :param category_name: The disaster category name
    """

    model = gensim.models.Word2Vec.load('disaster.model')

    disaster = ut.read_csv('disaster.csv')

    X = []
    Y = []

    num_rows = ut.row_count(disaster)

    for index, row in disaster.iterrows():
        for word in row['message'].upper().split(' '):
            if word in model.wv.vocab:
                X.append(model.wv[word])
                Y.append(row[category_name])

        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows) + ' rows')

    pca = PCA(n_components=2)

    principalComponents = pca.fit_transform(X)
    finalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf['Is' + category_name] = pd.Series(Y)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)

    targets = [0, 1]
    colors = ['r', 'g']
    for target, color in zip(targets, colors):
        indicesToKeep = finalDf['Is' + category_name] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
                   finalDf.loc[indicesToKeep, 'principal component 2'],
                   c=color,
                   s=5)
    ax.legend(targets)
    ax.grid()

    plt.show()
def print_disaster_category_values():
    """
    Prints all the disaster category values (to find out whether the '2' values are a mistake)
    """
    disaster = ut.read_csv('data/disaster.csv')
    non_cat_names = ['id', 'message', 'original', 'genre']

    for cat in list(dropwhile(lambda x: x in non_cat_names, disaster.columns)):
        print(cat)
        print('-------------------------')
        for value in disaster[cat].unique():
            print(
                str(value) + ' - ' +
                str(ut.row_count(disaster[disaster[cat] == value])))
        print()
def user_user_recs_part2(user_id, m, user_item_matrix, interactions):
    """
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides them as recs
    Does this until m recommendations are found
    :param user_id: The user id
    :param m: The number of top recommendations to return
    :param user_item_matrix: The user-item interaction matrix
    :param interactions: The raw interaction data
    :return: The top m recommended article ids and names
    """

    seen_article_ids, seen_article_names = get_user_articles(
        user_id, user_item_matrix, interactions)
    similar_users_ids = get_top_sorted_users(user_id, user_item_matrix,
                                             interactions)
    recommended_article_ids = []
    for similar_users_id in similar_users_ids:
        # Get similar articles
        similar_article_ids, similar_article_names = get_user_articles(
            similar_users_id, user_item_matrix, interactions)
        # Find the unseen ones
        unseen = np.setdiff1d(similar_article_ids, seen_article_ids)
        # Make them seen
        seen_article_ids = np.concatenate((seen_article_ids, unseen),
                                          axis=None)
        # Add them to recommendations
        recommended_article_ids = np.concatenate(
            (recommended_article_ids, unseen), axis=None)
        # Break if we have enough
        if len(recommended_article_ids) >= m:
            break

    # Sort ids by number of interactions and then prune lowest
    recommended_article_ids = ex.dictionary(
        (article_id,
         ut.row_count(interactions[interactions['article_id'] == float(
             article_id)]))
        for article_id in recommended_article_ids).get_sorted().key_list()[0:m]

    # Get article names
    recommended_article_names = get_article_names(recommended_article_ids,
                                                  interactions)

    return recommended_article_ids, recommended_article_names
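

# Hypothetical usage sketch, not part of the original module. The user id 1,
# m = 10 and the CSV path are only illustrative values.
def _example_user_user_recs_part2():
    interactions = ut.read_csv('data/user-item-interactions.csv')
    user_item_matrix = create_user_item_matrix(interactions)
    rec_ids, rec_names = user_user_recs_part2(1, 10, user_item_matrix,
                                              interactions)
    for article_id, article_name in zip(rec_ids, rec_names):
        print(f'{article_id}: {article_name}')
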
def pca_compare_categories(disaster_df, category_zero, category_one):
    """
    Show 2D PCA to contrast 2 categories in the disaster df
    :param disaster_df: The NORMALIZED disaster df
    :param category_zero: The first category
    :param category_one: The second category
    """

    word_frequency = get_disaster_word_frequency(disaster_df)

    total = ut.row_count(disaster_df)

    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():

        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 instances
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            if row[category_zero] == 0 and row[category_one] == 0:
                Y.append('Neither')
            elif row[category_zero] == 0 and row[category_one] == 1:
                Y.append(category_one)
            elif row[category_zero] == 1 and row[category_one] == 0:
                Y.append(category_zero)
            elif row[category_zero] == 1 and row[category_one] == 1:
                Y.append('Both')

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    ut.show_2d_pca(X, Y, ['red', 'green', 'blue', 'purple'])
def find_most_biased_word_for(category_name):
    """
    Goes through disaster.csv and prints the words that are the strongest indicators of the given category
    :param category_name: The name of the target category
    """

    disaster = ut.read_csv('disaster.csv')

    num_rows = ut.row_count(disaster)

    word_target_count = {}
    for index, row in disaster.iterrows():

        for word in row['message'].upper().split(' '):

            if word not in word_target_count:
                # [count of 0-labelled rows, count of 1-labelled rows, ones/zeros ratio]
                word_target_count[word] = [0, 0, 0]

            counts = word_target_count[word]
            counts[row[category_name]] += 1
            counts[2] = counts[1] / counts[0] if counts[0] > 0 else 2147483648

        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows))

    word_corrs = pd.DataFrame()
    word_corrs['word'] = word_target_count.keys()
    word_corrs['zeros'] = pd.Series(
        map(lambda x: x[0], word_target_count.values()))
    word_corrs['ones'] = pd.Series(
        map(lambda x: x[1], word_target_count.values()))
    word_corrs['one2zero'] = pd.Series(
        map(lambda x: x[2], word_target_count.values()))

    word_corrs = word_corrs.sort_values(by=['one2zero'], ascending=False)
    word_corrs.to_csv('word_corrs.csv', index=False)

    for index, row in word_corrs[
            word_corrs['one2zero'] < 2147483648].iterrows():

        print(row['word'] + ' - Ones: ' + str(row['ones']) + ', Zeros: ' +
              str(row['zeros']))
        input()
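

# Illustrative note, not part of the original module: the 'one2zero' column
# above is the ones / zeros ratio for a word, so a word seen in 90 messages
# labelled 1 and 30 labelled 0 scores 3.0, while a word never seen with a 0
# label gets the sentinel 2147483648 and is excluded from the printed preview.
def _example_one2zero_ratio():
    ones, zeros = 90, 30
    one2zero = ones / zeros if zeros > 0 else 2147483648
    print(one2zero)  # 3.0
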
def try_nn_avgvec_with(disaster_df, category_name, output_model_filename):
    """
    Try training a simple NN to predict the given category (averages the word vectors in each message)
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The output category name
    :param output_model_filename: The file path to output the model to
    """

    word_frequency = get_disaster_word_frequency(disaster_df)

    total = ut.row_count(disaster_df)

    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():

        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 instances
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    nn_train_save_show_results(X,
                               Y,
                               hidden_layer_sizes=(8, 5, 5, 5),
                               model_file_name=output_model_filename,
                               solver='lbfgs',
                               max_iter=100000)
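

# Hypothetical usage sketch, not part of the original module. It assumes the
# disaster CSV's 'message' column holds stringified token lists (as in
# create_word_bias_data above); the category name and model path are only
# illustrative.
def _example_try_nn_avgvec_with():
    disaster_df = ut.read_csv('data/disaster.csv')
    disaster_df['message'] = disaster_df['message'].apply(ast.literal_eval)
    try_nn_avgvec_with(disaster_df, 'water',
                       'investigation_results/try_nn/avgvec_water.pkl')
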
def print_disaster_dupe_summary():
    """
    Goes through the merged and categorized disaster.csv and prints the ids that appear more than once, along with a
    preview of their messages
    """

    disaster = ut.read_csv('../data/disaster.csv')

    # Check for dupes
    ids = set(disaster['id'])

    dupe_ids = []
    for id in ids:
        if ut.row_count(disaster[disaster['id'] == id]) > 1:
            print(id)
            dupe_ids.append(id)

    for dupe_id in dupe_ids:
        print(disaster[disaster['id'] == dupe_id]['message'])
def show_disaster_tsne(disaster_df, category_name):
    """
    Perform t-SNE dimensionality reduction on the averaged message word vectors and label the points based on the
    given category
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The category to use for labelling
    """

    word_frequency = get_disaster_word_frequency(disaster_df)

    disaster_df = disaster_df.sample(10000)

    total = ut.row_count(disaster_df)

    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():

        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 instances
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    ut.show_2d_tsne(X, Y, ['r', 'g'])
def get_article_id_frequency(articles):
    """
    Get the frequency with which each article Id appears
    :param articles: The articles data
    :return: The frequency mapping
    """

    print('Scanning articles for dupes...')
    total = ut.row_count(articles)
    article_frequency_mapping = {}
    for index, row in articles.iterrows():
        if row['article_id'] in article_frequency_mapping:
            article_frequency_mapping[row['article_id']] += 1
        else:
            article_frequency_mapping[row['article_id']] = 1
        ut.update_progress(index, total)
    print('\n')

    for article_id, frequency in ut.sorted_dictionary(
            article_frequency_mapping, ascending=False):
        print(f'Article Id: {article_id} appeared {frequency} times')
        input()

    return article_frequency_mapping
def try_nn_with(disaster_df, category_name):
    """
    Try training a neural network for the given category output
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The category name
    """

    word_frequency = get_disaster_word_frequency(disaster_df)

    total = ut.row_count(disaster_df)

    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():

        for word in row['message']:
            # Disregard words with fewer than 50 instances
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                X.append(vector)
                Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    nn_train_save_show_results(
        X,
        Y,
        hidden_layer_sizes=(60, 30),
        model_file_name='investigation_results/try_nn/first_model.pkl')
def show_disaster_pca_avgvec(disaster_df, category_name):
    """
    Show a 2D PCA for the given category based on the disaster data
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The category for PCA
    """

    word_frequency = get_disaster_word_frequency(disaster_df)

    total = ut.row_count(disaster_df)

    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():

        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 instances
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    ut.show_2d_pca(X, Y, ['r', 'g'])