# Standard-library and third-party imports used by the functions below.
import ast
from itertools import dropwhile

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Project-local modules and helpers referenced below (assumed available):
# ut, ex, get_email_to_id_mapping, get_user_articles, get_top_sorted_users,
# get_article_names, get_disaster_word_frequency, nn_train_save_show_results.


def create_word_bias_data(disaster_csv, bias_file_name):
    """
    Based on the disaster data, generates a file to store the bias data for
    word ==> category
    :param disaster_csv: The disaster.csv file path
    :param bias_file_name: The file name of the output file with bias data
    """
    # Read data
    disaster = ut.read_csv(disaster_csv)
    disaster['message'] = disaster['message'].apply(ast.literal_eval)
    non_category_names = [
        'id', 'message', 'original', 'genre_direct', 'genre_news',
        'genre_social'
    ]
    category_names = list(
        dropwhile(lambda x: x in non_category_names, disaster.columns))

    # Record word to category frequency mapping
    bias_data = {}
    total = ut.row_count(disaster)
    for index, row in disaster.iterrows():
        for word in row['message']:
            if word not in bias_data:
                bias_data[word] = {}
                for category_name in category_names:
                    bias_data[word][category_name + '_ones'] = 0
                    bias_data[word][category_name + '_total'] = 0
            for category_name in category_names:
                bias_data[word][category_name + '_ones'] += row[category_name]
                bias_data[word][category_name + '_total'] += 1
        if index % 100 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    # Generate a data frame from the frequency mapping
    bias = pd.DataFrame()
    bias['word'] = list(bias_data.keys())

    # Populate each category's ones and total columns and add them to the dataframe
    columns = bias_data[next(iter(bias_data))].keys()
    for column in columns:
        bias[column] = [bias_data[word][column] for word in bias_data]

    # For each category, calculate the bias based on the ones and total data
    for category_name in category_names:
        bias[category_name + '_bias'] = (
            bias[category_name + '_ones'] / bias[category_name + '_total'])

    bias.to_csv(bias_file_name, index=False)
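

# A minimal alternative sketch (not part of the original pipeline): assuming
# 'message' parses to a list of words and the category columns are 0/1, the
# same per-word bias can be computed with a pandas explode + groupby instead
# of the explicit loop above.
def create_word_bias_data_vectorized(disaster_csv, bias_file_name):
    """Vectorized variant of create_word_bias_data using explode/groupby."""
    disaster = ut.read_csv(disaster_csv)
    disaster['message'] = disaster['message'].apply(ast.literal_eval)
    non_category_names = [
        'id', 'message', 'original', 'genre_direct', 'genre_news',
        'genre_social'
    ]
    category_names = [c for c in disaster.columns
                      if c not in non_category_names]

    # One row per word occurrence; each occurrence keeps its message's labels
    exploded = (disaster[['message'] + category_names]
                .explode('message')
                .rename(columns={'message': 'word'}))

    grouped = exploded.groupby('word')
    bias = grouped[category_names].sum().add_suffix('_ones')
    totals = grouped.size()
    for category_name in category_names:
        bias[category_name + '_total'] = totals
        bias[category_name + '_bias'] = (
            bias[category_name + '_ones'] / bias[category_name + '_total'])

    bias.reset_index().to_csv(bias_file_name, index=False)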


def create_user_item_matrix(interactions):
    """
    Return a matrix with user ids as rows and article ids as columns, with a 1
    where a user interacted with an article and a 0 otherwise
    :param interactions: The interactions data
    :return: The user item matrix
    """
    # Create df with user_id column
    print('Creating User Id column...')
    user_item_matrix = pd.DataFrame()
    email_to_id_mapping, user_id_column = get_email_to_id_mapping(interactions)
    user_item_matrix['user_id'] = list(email_to_id_mapping.values())

    # Create df with zeros for each article_id
    print('Creating df with zeros...')
    unique_article_ids = set(interactions['article_id'])
    article_df = pd.DataFrame(columns=list(unique_article_ids))
    user_id_count = ut.row_count(user_item_matrix)
    current = 1
    total = len(article_df.columns)
    for column in article_df.columns:
        article_df[column] = np.zeros(user_id_count)
        ut.update_progress(current, total)
        current += 1

    # Join both dfs
    print('Joining...')
    user_item_matrix = user_item_matrix.join(article_df)

    # Flip switch to 1 for each unique interaction
    print('Getting unique interactions...')
    unique_interactions = set(
        interactions.apply(
            lambda row: str(row['article_id']) + '--' + str(row['email']),
            axis=1))
    current = 1
    total = len(unique_interactions)
    print('Flipping switches from 0 to 1...')
    for interaction in unique_interactions:
        sections = interaction.split('--')
        article_id = float(sections[0])
        email = sections[1] if sections[1] != 'nan' else np.nan
        user_id = email_to_id_mapping[email]
        user_item_matrix.loc[user_item_matrix['user_id'] == user_id,
                             article_id] = 1
        ut.update_progress(current, total)
        current += 1

    return user_item_matrix
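

# A minimal alternative sketch (not the function used above): the same 0/1
# matrix can be built with one groupby + unstack instead of flipping cells one
# interaction at a time. Rows are keyed by email here; re-keying them to user
# ids (as done above with get_email_to_id_mapping) is left out of the sketch.
def create_user_item_matrix_by_email(interactions):
    """Email-indexed 0/1 user-item matrix built via groupby + unstack."""
    # dropna=False keeps the NaN-email pseudo-user (pandas >= 1.1)
    counts = (interactions.groupby(['email', 'article_id'], dropna=False)
              .size()
              .unstack(fill_value=0))
    # Any repeat interaction collapses to a single 1
    return counts.clip(upper=1)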


def show_disaster_pca_for(category_name):
    """
    Show a PCA where the data points are the word vectors and the targets are
    the values in the given category
    :param category_name: The disaster category name
    """
    model = gensim.models.Word2Vec.load('disaster.model')
    disaster = ut.read_csv('disaster.csv')
    X = []
    Y = []
    num_rows = ut.row_count(disaster)
    for index, row in disaster.iterrows():
        for word in row['message'].upper().split(' '):
            if word in model.wv.vocab:
                X.append(model.wv[word])
                Y.append(row[category_name])
        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows) + ' rows')

    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)
    final_df = pd.DataFrame(
        data=principal_components,
        columns=['principal component 1', 'principal component 2'])
    final_df['Is' + category_name] = pd.Series(Y)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)
    targets = [0, 1]
    colors = ['r', 'g']
    for target, color in zip(targets, colors):
        indices_to_keep = final_df['Is' + category_name] == target
        ax.scatter(final_df.loc[indices_to_keep, 'principal component 1'],
                   final_df.loc[indices_to_keep, 'principal component 2'],
                   c=color,
                   s=5)
    ax.legend(targets)
    ax.grid()
    plt.show()


def print_disaster_category_values():
    """
    Prints all the disaster category values
    (To find out if the '2's are a mistake)
    """
    disaster = ut.read_csv('data/disaster.csv')
    non_cat_names = ['id', 'message', 'original', 'genre']
    for cat in list(dropwhile(lambda x: x in non_cat_names, disaster.columns)):
        print(cat)
        print('-------------------------')
        for value in disaster[cat].unique():
            print(
                str(value) + ' - ' +
                str(ut.row_count(disaster[disaster[cat] == value])))
        print()


def user_user_recs_part2(user_id, m, user_item_matrix, interactions):
    """
    Loops through the users based on closeness to the input user_id
    For each user - finds articles the user hasn't seen before and provides
    them as recs
    Does this until m recommendations are found
    :param user_id: The user id
    :param m: The m top recommendations to get
    :param user_item_matrix: The user item interaction matrix
    :param interactions: The raw interaction data
    :return: The top m recommendations
    """
    seen_article_ids, seen_article_names = get_user_articles(
        user_id, user_item_matrix, interactions)
    similar_users_ids = get_top_sorted_users(user_id, user_item_matrix,
                                             interactions)
    recommended_article_ids = []
    for similar_users_id in similar_users_ids:
        # Get the similar user's articles
        similar_article_ids, similar_article_names = get_user_articles(
            similar_users_id, user_item_matrix, interactions)

        # Find the unseen ones
        unseen = np.setdiff1d(similar_article_ids, seen_article_ids)

        # Make them seen
        seen_article_ids = np.concatenate((seen_article_ids, unseen),
                                          axis=None)

        # Add them to recommendations
        recommended_article_ids = np.concatenate(
            (recommended_article_ids, unseen), axis=None)

        # Break if we have enough
        if len(recommended_article_ids) >= m:
            break

    # Sort ids by number of interactions and then prune the lowest
    recommended_article_ids = ex.dictionary(
        (article_id,
         ut.row_count(interactions[interactions['article_id'] == float(
             article_id)]))
        for article_id in recommended_article_ids).get_sorted().key_list()[0:m]

    # Get article names
    recommended_article_names = get_article_names(recommended_article_ids,
                                                  interactions)

    return recommended_article_ids, recommended_article_names
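

# Hypothetical usage sketch (the CSV path, user id and m are placeholders,
# not taken from the project):
# interactions = ut.read_csv('data/user-item-interactions.csv')
# user_item_matrix = create_user_item_matrix(interactions)
# rec_ids, rec_names = user_user_recs_part2(
#     user_id=20, m=10, user_item_matrix=user_item_matrix,
#     interactions=interactions)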


def pca_compare_categories(disaster_df, category_zero, category_one):
    """
    Show 2D PCA to contrast 2 categories in the disaster df
    :param disaster_df: The NORMALIZED disaster df
    :param category_zero: The first category
    :param category_one: The second category
    """
    word_frequency = get_disaster_word_frequency(disaster_df)
    total = ut.row_count(disaster_df)
    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():
        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 occurrences
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            if row[category_zero] == 0 and row[category_one] == 0:
                Y.append('Neither')
            elif row[category_zero] == 0 and row[category_one] == 1:
                Y.append(category_one)
            elif row[category_zero] == 1 and row[category_one] == 0:
                Y.append(category_zero)
            elif row[category_zero] == 1 and row[category_one] == 1:
                Y.append('Both')

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    ut.show_2d_pca(X, Y, ['red', 'green', 'blue', 'purple'])


def find_most_biased_word_for(category_name):
    """
    Goes into the disaster.csv and prints the words that are the strongest
    indicator of the given category
    :param category_name: The name of the target category
    """
    disaster = ut.read_csv('disaster.csv')
    num_rows = ut.row_count(disaster)
    word_target_count = {}
    for index, row in disaster.iterrows():
        for word in row['message'].upper().split(' '):
            if word not in word_target_count:
                word_target_count[word] = [0, 0, 0]
            word_target_count[word][row[category_name]] += 1
            # Ratio of ones to zeros (2147483648 stands in for "infinite")
            word_target_count[word][2] = (
                word_target_count[word][1] / word_target_count[word][0]
                if word_target_count[word][0] > 0 else 2147483648)
        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows))

    word_corrs = pd.DataFrame()
    word_corrs['word'] = list(word_target_count.keys())
    word_corrs['zeros'] = pd.Series(
        map(lambda x: x[0], word_target_count.values()))
    word_corrs['ones'] = pd.Series(
        map(lambda x: x[1], word_target_count.values()))
    word_corrs['one2zero'] = pd.Series(
        map(lambda x: x[2], word_target_count.values()))
    word_corrs = word_corrs.sort_values(by=['one2zero'], ascending=False)
    word_corrs.to_csv('word_corrs.csv', index=False)

    for index, row in word_corrs[
            word_corrs['one2zero'] < 2147483648].iterrows():
        print(row['word'] + ' - Ones: ' + str(row['ones']) + ', Zeros: ' +
              str(row['zeros']))
        input()


def try_nn_avgvec_with(disaster_df, category_name, output_model_filename):
    """
    Try training a simple NN to predict the given category
    (Averages word vectors in 1 message)
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The output category name
    :param output_model_filename: The file path to output the model to
    """
    word_frequency = get_disaster_word_frequency(disaster_df)
    total = ut.row_count(disaster_df)
    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():
        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 occurrences
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    nn_train_save_show_results(X,
                               Y,
                               hidden_layer_sizes=(8, 5, 5, 5),
                               model_file_name=output_model_filename,
                               solver='lbfgs',
                               max_iter=100000)


def print_disaster_dupe_summary():
    """
    Goes through the merged and categorized disaster.csv and prints the ids
    that are duplicates and a preview of the messages
    """
    disaster = ut.read_csv('../data/disaster.csv')

    # Check for dupes
    ids = set()
    disaster['id'].apply(lambda x: ids.add(x))
    dupe_ids = []
    for id in ids:
        if ut.row_count(disaster[disaster['id'] == id]) > 1:
            print(id)
            dupe_ids.append(id)

    for dupe_id in dupe_ids:
        print(disaster[disaster['id'] == dupe_id]['message'])
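

# A shorter alternative sketch for the same dupe check (not the original
# helper): pandas can flag duplicated ids directly.
# dupes = disaster[disaster.duplicated('id', keep=False)]
# print(dupes.sort_values('id')[['id', 'message']])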


def show_disaster_tsne(disaster_df, category_name):
    """
    Perform t-SNE dimensionality reduction on the average of the message word
    vectors and label the clusters based on the category name
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The category to use for labelling
    """
    word_frequency = get_disaster_word_frequency(disaster_df)
    disaster_df = disaster_df.sample(10000)
    total = ut.row_count(disaster_df)
    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():
        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 occurrences
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    ut.show_2d_tsne(X, Y, ['r', 'g'])


def get_article_id_frequency(articles):
    """
    Get the frequency with which each article Id appears
    :param articles: The articles data
    :return: The frequency mapping
    """
    print('Scanning articles for dupes...')
    total = ut.row_count(articles)
    article_frequency_mapping = {}
    for index, row in articles.iterrows():
        if row['article_id'] in article_frequency_mapping:
            article_frequency_mapping[row['article_id']] += 1
        else:
            article_frequency_mapping[row['article_id']] = 1
        ut.update_progress(index, total)
    print('\n')

    for article_id, frequency in ut.sorted_dictionary(
            article_frequency_mapping, ascending=False):
        print(f'Article Id: {article_id} appeared {frequency} times')
    input()

    return article_frequency_mapping


def try_nn_with(disaster_df, category_name):
    """
    Try training a neural network for the given category output
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The category name
    """
    word_frequency = get_disaster_word_frequency(disaster_df)
    total = ut.row_count(disaster_df)
    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():
        for word in row['message']:
            # Disregard words with fewer than 50 occurrences
            if word_frequency[word] < 50:
                continue

            # Process the rest
            words.append(word)
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                X.append(vector)
                Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    nn_train_save_show_results(
        X,
        Y,
        hidden_layer_sizes=(60, 30),
        model_file_name='investigation_results/try_nn/first_model.pkl')


def show_disaster_pca_avgvec(disaster_df, category_name):
    """
    Show 2D PCA for the given category based on disaster data
    :param disaster_df: The NORMALIZED disaster df
    :param category_name: The category for PCA
    """
    word_frequency = get_disaster_word_frequency(disaster_df)
    total = ut.row_count(disaster_df)
    words = []
    X = []
    Y = []
    for index, row in disaster_df.iterrows():
        message_vectors = []
        for word in row['message']:
            # Disregard words with fewer than 50 occurrences
            if word_frequency[word] < 50:
                continue

            # Process the rest
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                words.append(word)
                message_vectors.append(vector)

        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])

        if index % 10000 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    ut.show_2d_pca(X, Y, ['r', 'g'])
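

# The average-message-vector loop above is repeated in pca_compare_categories,
# try_nn_avgvec_with, show_disaster_tsne and show_disaster_pca_avgvec. A small
# helper could factor it out: a sketch only, assuming ut.try_word2vec and
# get_disaster_word_frequency behave exactly as used above.
def get_average_message_vectors(disaster_df, category_name, min_word_count=50):
    """Return (X, Y): one averaged word vector and one label per usable message."""
    word_frequency = get_disaster_word_frequency(disaster_df)
    X = []
    Y = []
    for _, row in disaster_df.iterrows():
        message_vectors = []
        for word in row['message']:
            # Skip rare words, mirroring the frequency cutoff used above
            if word_frequency[word] < min_word_count:
                continue
            vector, op_success = ut.try_word2vec(word)
            if op_success:
                message_vectors.append(vector)
        if message_vectors:
            X.append(np.average(message_vectors, axis=0))
            Y.append(row[category_name])
    return X, Y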