Example #1
def load_data(messages_filepath, categories_filepath):
    """
    Loads the data from the given locations, ready to be cleaned
    :param messages_filepath: The file path for the messages data
    :param categories_filepath: The file path for the categorization data
    :return: The messages and categories merged on 'id' (inner join)
    """
    messages = ut.read_csv(messages_filepath)
    categories = ut.read_csv(categories_filepath)

    return pd.merge(messages, categories, on='id', how='inner')
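A minimal usage sketch; the file paths below are illustrative rather than taken from the project, and ut.read_csv is assumed to behave like pandas.read_csv:

# Hypothetical paths -- adjust to wherever the raw csv files live
df = load_data('data/disaster_messages.csv', 'data/disaster_categories.csv')
print(df.shape)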
Example #2
def clean_raw_data():
    """
    Cleans the raw data by removing unnecessary columns
    """

    interactions = ut.read_csv('data/raw/user-item-interaction.csv')
    articles = ut.read_csv('data/raw/articles_community.csv')

    del interactions['Unnamed: 0']
    del articles['Unnamed: 0']
    del interactions['Unnamed: 0.1']
    del articles['Unnamed: 0.1']

    interactions.to_csv('data/interactions.csv', index=False)
    articles.to_csv('data/articles.csv', index=False)
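Equivalently, the stray index columns could be removed in one call with DataFrame.drop; a sketch of the same cleanup (not the project's code):

    # Same cleanup using drop; errors='ignore' skips columns that are absent
    cols = ['Unnamed: 0', 'Unnamed: 0.1']
    interactions = interactions.drop(columns=cols, errors='ignore')
    articles = articles.drop(columns=cols, errors='ignore')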
Example #3
def create_word_bias_data(disaster_csv, bias_file_name):
    """
    Based on the disaster data, generates a file to store the bias data for word ==> category
    :param disaster_csv: The disaster.csv file path
    :param bias_file_name: The file name of the output file with bias data
    """

    # Read data
    disaster = ut.read_csv(disaster_csv)
    disaster['message'] = disaster['message'].apply(ast.literal_eval)
    non_category_names = [
        'id', 'message', 'original', 'genre_direct', 'genre_news',
        'genre_social'
    ]
    # dropwhile skips the leading non-category columns; everything after them is treated as a category
    category_names = list(
        dropwhile(lambda x: x in non_category_names, disaster.columns))

    # Record word to category frequency mapping
    bias_data = {}
    total = ut.row_count(disaster)
    for index, row in disaster.iterrows():

        for word in row['message']:

            if word not in bias_data:
                bias_data[word] = {}
                for category_name in category_names:
                    bias_data[word][category_name + '_ones'] = 0
                    bias_data[word][category_name + '_total'] = 0

            for category_name in category_names:
                bias_data[word][category_name + '_ones'] += row[category_name]
                bias_data[word][category_name + '_total'] += 1

        if index % 100 == 0:
            print('Done ' + str(index) + ' of ' + str(total))

    # Generate a data frame from the frequency mapping
    bias = pd.DataFrame()
    bias['word'] = bias_data.keys()

    # Populate each category's ones and total columns and add them to the dataframe
    columns = bias_data[next(iter(bias_data))].keys()
    for column in columns:
        bias[column] = [bias_data[word][column] for word in bias_data]

    # For each category, calculate the bias based on the ones and total data
    for category_name in category_names:
        bias[category_name + '_bias'] = (
            bias[category_name + '_ones'] / bias[category_name + '_total'])

    bias.to_csv(bias_file_name, index=False)
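For reference, the same word-to-category counts can be computed without explicit loops; a vectorized sketch using pandas explode and groupby, assuming (as above) that 'message' already holds token lists and that category_names has been built:

import pandas as pd

def word_category_bias(disaster, category_names):
    # One row per word occurrence, keeping the category indicator columns
    exploded = disaster[['message'] + category_names].explode('message')
    exploded = exploded.rename(columns={'message': 'word'})
    grouped = exploded.groupby('word')

    # '<category>_ones' = sum of indicators per word, '<category>_total' = occurrences per word
    ones = grouped[category_names].sum()
    totals = grouped.size()

    # Bias = share of occurrences where the category indicator is 1
    bias = ones.div(totals, axis=0).add_suffix('_bias')
    return bias.reset_index()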
Example #4
def create_disaster_pipeline(disaster_csv_path, category_name):
    """
    Trains a CountVectorizer -> TfidfTransformer -> RandomForestClassifier pipeline
    for the given disaster category, then prints prediction results on a held-out test set
    :param disaster_csv_path: The disaster csv file path
    :param category_name: The category column to predict
    """

    disaster = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    X = disaster['message'].values
    Y = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.3)

    print('Creating pipeline...')
    pipeline = pi.Pipeline([
        ('vect',
         st.CountVectorizer(
             tokenizer=lambda text: (pt.pipe
                                     | __normalize_text__
                                     | __tokenize_text__
                                     | __remove_stopwords__
                                     | __lemmatize_text__)(text))),
        ('tfidf', st.TfidfTransformer()), ('clf', en.RandomForestClassifier())
    ])

    print('Fitting pipeline...')
    pipeline.fit(x_train, y_train)

    print('Predicting with pipeline...')
    y_pred = pipeline.predict(x_test)

    print('Displaying results...')
    display_results(y_test, y_pred)

Example #5
def create_disaster_sequence(disaster_csv_path, category_name):
    """
    Same as create_disaster_pipeline, but runs the vectorizer, tfidf transform and
    classifier as explicit steps instead of a single Pipeline
    :param disaster_csv_path: The disaster csv file path
    :param category_name: The category column to predict
    """

    disaster = ut.read_csv(disaster_csv_path)

    print('Getting Data...')
    X = disaster['message'].values
    Y = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.3)

    print('Tokenizing and count vectorizing...')
    vect = st.CountVectorizer(tokenizer=lambda message: (
        pt.pipe
        | __normalize_text__
        | __tokenize_text__
        | __remove_stopwords__
        # | __stem_text__
        | __lemmatize_text__)(message))

    print('Tfidf transforming...')
    tfidf = st.TfidfTransformer()
    classifier = en.RandomForestClassifier()

    print('Fitting classifier on train...')
    x_train_counts = vect.fit_transform(x_train)
    x_train_tfidf = tfidf.fit_transform(x_train_counts)
    classifier.fit(x_train_tfidf, y_train)

    print('Running classifier on test...')
    x_test_counts = vect.transform(x_test)
    x_test_tfidf = tfidf.transform(x_test_counts)
    y_pred = classifier.predict(x_test_tfidf)

    print('Displaying results...')
    display_results(y_test, y_pred)
Example #6
def create_normalized_disaster_to(file_name):
    """
    Normalizes the disaster data and writes the result to a csv file
    :param file_name: The output file name
    """

    disaster = ut.read_csv('../../data/disaster.csv')\
        .pipe(nlp.remove_columns)\
        .pipe(nlp.one_hot_encode_genre)\
        .pipe(nlp.normalize_related_category_values)\
        .pipe(nlp.normalize_messages)

    disaster.to_csv(file_name, index=False)
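The chained .pipe calls pass the DataFrame through each cleaning step in turn; a self-contained illustration of the pattern with made-up step functions (the real steps live in the project's nlp module):

import pandas as pd

def drop_empty_rows(df):
    return df.dropna(how='all')

def add_message_length(df):
    return df.assign(length=df['message'].str.len())

df = pd.DataFrame({'message': ['help needed', None, 'flood in town']})
cleaned = df.pipe(drop_empty_rows).pipe(add_message_length)
print(cleaned)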
Example #7
def show_disaster_pca_for(category_name):
    """
    Shows a PCA where the data points are the word vectors and the targets are the values of the given category
    :param category_name: The disaster category name
    """

    model = gensim.models.Word2Vec.load('disaster.model')

    disaster = ut.read_csv('disaster.csv')

    X = []
    Y = []

    num_rows = ut.row_count(disaster)

    for index, row in disaster.iterrows():
        for word in row['message'].upper().split(' '):
            if word in model.wv.vocab:
                X.append(model.wv[word])
                Y.append(row[category_name])

        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows) + ' rows')

    pca = PCA(n_components=2)

    principalComponents = pca.fit_transform(X)
    finalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf['Is' + category_name] = pd.Series(Y)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)

    targets = [0, 1]
    colors = ['r', 'g']
    for target, color in zip(targets, colors):
        indicesToKeep = finalDf['Is' + category_name] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
                   finalDf.loc[indicesToKeep, 'principal component 2'],
                   c=color,
                   s=5)
    ax.legend(targets)
    ax.grid()

    plt.show()
Example #8
def create_word_vectors(model_bin_file,
                        weather_words_csv,
                        all_words_csv,
                        output_dir,
                        all_word_sample_size=500):
    """
    Saves a word2vec vector (.npy file) for each weather word and for a random sample of
    general words, skipping words that are not in the model's vocabulary
    :param model_bin_file: Path to the binary word2vec model
    :param weather_words_csv: Csv of weather related words
    :param all_words_csv: Csv of general words to sample from
    :param output_dir: Directory to save the .npy vectors into
    :param all_word_sample_size: Number of general words to sample
    """

    weather = ut.read_csv(weather_words_csv)
    all_words = ut.read_csv(all_words_csv).sample(all_word_sample_size)

    model = gensim.models.KeyedVectors.load_word2vec_format(model_bin_file,
                                                            binary=True)

    # Save the vector for each word that exists in the model's vocabulary
    for index, row in weather.iterrows():
        try:
            np.save(output_dir + '/' + row['word'], model[row['word']])
        except KeyError:
            # Word not present in the pre-trained model
            pass

    for index, row in all_words.iterrows():
        try:
            np.save(output_dir + '/' + row['word'], model[row['word']])
        except KeyError:
            pass
Example #9
def print_unique_lengths_of_categories():
    """
    Prints all the different lengths that the 'categories' column has
    (If this prints more than one number, the data has a problem)
    """

    lengths = set()
    categories = ut.read_csv('../data/disaster_categories.csv')

    for index, row in categories.iterrows():

        lengths.add(len(row['categories'].split(';')))

    for length in lengths:
        print(length)
Example #10
def print_disaster_category_values():
    """
    Prints all the disaster category values (to find out whether the '2' values are a mistake)
    """
    disaster = ut.read_csv('data/disaster.csv')
    non_cat_names = ['id', 'message', 'original', 'genre']

    for cat in list(dropwhile(lambda x: x in non_cat_names, disaster.columns)):
        print(cat)
        print('-------------------------')
        for value in disaster[cat].unique():
            print(
                str(value) + ' - ' +
                str(ut.row_count(disaster[disaster[cat] == value])))
        print()
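The same per-value row counts are available directly from Series.value_counts(); a minimal sketch under the same column assumptions (substituting pandas.read_csv for the project's ut.read_csv):

from itertools import dropwhile
import pandas as pd

disaster = pd.read_csv('data/disaster.csv')
non_cat_names = ['id', 'message', 'original', 'genre']

for cat in list(dropwhile(lambda x: x in non_cat_names, disaster.columns)):
    print(cat)
    print('-------------------------')
    print(disaster[cat].value_counts())
    print()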
Example #11
def show_weather_pca(word_vector_dir, weather_words_csv):
    """
    Given names of the csv files, plots PCA for weather words vs normal word sample
    :param word_vector_dir: The file name of the csv containing the sample normal words
    :param weather_words_csv: The file name of the csv containing weather related words
    :return:
    """

    weather_words = ut.read_csv(weather_words_csv)

    X = []
    Y = []
    for filename in os.listdir(word_vector_dir):

        X.append(np.load(word_vector_dir + '/' + filename))

        if weather_words['word'].str.contains(filename.replace('.npy',
                                                               '')).any():
            Y.append('weather_related')
        else:
            Y.append('general')

    pca = PCA(n_components=2)

    principalComponents = pca.fit_transform(X)
    finalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf['Category'] = pd.Series(Y)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)

    targets = ['weather_related', 'general']
    colors = ['r', 'g']
    for target, color in zip(targets, colors):
        indicesToKeep = finalDf['Category'] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
                   finalDf.loc[indicesToKeep, 'principal component 2'],
                   c=color,
                   s=5)
    ax.legend(targets)
    ax.grid()

    plt.show()
Example #12
def find_most_biased_word_for(category_name):
    """
    Reads disaster.csv and prints the words that are the strongest indicators of the given category
    :param category_name: The name of the target category
    """

    disaster = ut.read_csv('disaster.csv')

    num_rows = ut.row_count(disaster)

    word_target_count = {}
    for index, row in disaster.iterrows():

        for word in row['message'].upper().split(' '):

            if word not in word_target_count:
                # [count of zeros, count of ones, ones-to-zeros ratio]
                word_target_count[word] = [0, 0, 0]

            counts = word_target_count[word]
            counts[row[category_name]] += 1
            # Ratio of ones to zeros; a large sentinel marks words never seen with a zero
            counts[2] = counts[1] / counts[0] if counts[0] > 0 else 2147483648

        if index % 5000 == 0:
            print('Done ' + str(index) + ' of ' + str(num_rows))

    word_corrs = pd.DataFrame()
    word_corrs['word'] = word_target_count.keys()
    word_corrs['zeros'] = pd.Series(
        map(lambda x: x[0], word_target_count.values()))
    word_corrs['ones'] = pd.Series(
        map(lambda x: x[1], word_target_count.values()))
    word_corrs['one2zero'] = pd.Series(
        map(lambda x: x[2], word_target_count.values()))

    word_corrs = word_corrs.sort_values(by=['one2zero'], ascending=False)
    word_corrs.to_csv('word_corrs.csv', index=False)

    for index, row in word_corrs[
            word_corrs['one2zero'] < 2147483648].iterrows():

        print(row['word'] + ' - Ones: ' + str(row['ones']) + ', Zeros: ' +
              str(row['zeros']))
        input()
Example #13
def print_disaster_dupe_summary():
    """
    Goes through the merged and categorized disaster.csv, prints the ids that appear more than once and shows a
    preview of their messages
    """

    disaster = ut.read_csv('../data/disaster.csv')

    # Check for dupes
    ids = set(disaster['id'])

    dupe_ids = []
    for id in ids:
        if ut.row_count(disaster[disaster['id'] == id]) > 1:
            print(id)
            dupe_ids.append(id)

    for dupe_id in dupe_ids:
        print(disaster[disaster['id'] == dupe_id]['message'])
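pandas' duplicated() marks the same rows without the per-id loop; a sketch of the equivalent check (again using pandas.read_csv in place of the project helper):

import pandas as pd

disaster = pd.read_csv('../data/disaster.csv')

# All rows whose 'id' occurs more than once
dupes = disaster[disaster.duplicated(subset='id', keep=False)]
print(dupes['id'].unique())
print(dupes[['id', 'message']])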
Example #14
def print_word_frequency():
    """
    Prints the word frequency in messages, from most frequent word to least frequent
    """

    messages = ut.read_csv('../disaster.csv')

    message_words = messages['message'].apply(lambda x: x.lower().split(' '))

    word_count = {}
    for message in message_words:

        for word in message:

            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1

    for key, value in sorted(word_count.items(),
                             key=lambda item: item[1],
                             reverse=True):
        print(key + ' - ' + str(value))
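collections.Counter expresses the same frequency count more compactly; a standalone sketch with inline sample messages instead of the csv:

from collections import Counter

messages = ['Water needed in the north', 'need water and food']
word_count = Counter(word for message in messages
                     for word in message.lower().split(' '))

# most_common() returns (word, count) pairs sorted from most to least frequent
for word, count in word_count.most_common():
    print(word + ' - ' + str(count))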
Example #15
def create_readble_bias(bias_file_name, database_filename, table_name):
    """
    Based on the bias file output, creates a new, readable table and saves it to an SQLite DB
    :param bias_file_name: The file with all the word ==> category indicator data
    :param database_filename: The database file name
    :param table_name: The name of the table
    """

    bias = ut.read_csv(bias_file_name)
    readable_bias = pd.DataFrame()

    for column in list(dropwhile(lambda x: '_bias' not in x, bias.columns)):

        category = column.replace('_bias', '')

        bias = bias.sort_values(by=[column], ascending=False)

        readable_bias[category + '_word'] = bias['word']
        readable_bias[category + '_ones'] = bias[category + '_ones']
        readable_bias[category + '_total'] = bias[category + '_total']
        readable_bias[category + '_bias'] = bias[category + '_bias']

    ut.to_db(readable_bias, database_filename, table_name)
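ut.to_db is a project helper whose implementation is not shown here; a plausible minimal equivalent, assuming it writes the DataFrame to an SQLite table via SQLAlchemy (an assumption, not the project's code):

import pandas as pd
from sqlalchemy import create_engine

def to_db(df, database_filename, table_name):
    # Write the DataFrame to an SQLite table, replacing any existing table of the same name
    engine = create_engine('sqlite:///' + database_filename)
    df.to_sql(table_name, engine, index=False, if_exists='replace')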
Example #16
import utility.util as ut
import sklearn.preprocessing as pp
import numpy as np
import keras.models as km
import keras.layers as kl
import keras as kr
# import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

# Get and reshape data
aapl_train = ut.read_csv('aapl_train.csv')

training_set = aapl_train.iloc[:, 1:2].values

scaler = pp.MinMaxScaler(feature_range=(0, 1))

training_set_scaled = scaler.fit_transform(training_set)

x_train = []
y_train = []
for i in range(200, 2000):
    x_train.append(training_set_scaled[i - 200:i, 0])
    y_train.append(training_set_scaled[i, 0])

x_train, y_train = np.array(x_train), np.array(y_train)

x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Build the LSTM
regressor = km.Sequential()
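The snippet ends after creating the Sequential model; a hedged sketch of how the LSTM might be completed (layer sizes, dropout and training settings are assumptions, not taken from the original):

# Assumed architecture: stacked LSTM layers with dropout, one output unit
regressor.add(kl.LSTM(units=50, return_sequences=True,
                      input_shape=(x_train.shape[1], 1)))
regressor.add(kl.Dropout(0.2))
regressor.add(kl.LSTM(units=50))
regressor.add(kl.Dropout(0.2))
regressor.add(kl.Dense(units=1))

regressor.compile(optimizer='adam', loss='mean_squared_error')
regressor.fit(x_train, y_train, epochs=100, batch_size=32)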
Example #17
import utility.util as ut
import sklearn.preprocessing as pp
import numpy as np
import keras.models as km
import keras.layers as kl
import keras as kr
# import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

# Get and reshape data
ftse_train = ut.read_csv('ftse_train.csv')

training_set = ftse_train.iloc[:, 1:2].values

scaler = pp.MinMaxScaler(feature_range=(0, 1))

training_set_scaled = scaler.fit_transform(training_set)

x_train = []
y_train = []
for i in range(50, 2000):
    x_train.append(training_set_scaled[i - 50:i, 0])
    y_train.append(training_set_scaled[i, 0])

x_train, y_train = np.array(x_train), np.array(y_train)

x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Build the LSTM
regressor = km.Sequential()
Example #18
import utility.util as ut
import sections as se

se.widen_df_display()

articles = ut.read_csv('data/articles.csv')
interactions = ut.read_csv('data/interactions.csv')

##################################################################################
#   Part I: Exploratory Data Analysis
##################################################################################
print('\n\nPress Enter to run Part I...')
input()

# SECTION 1.1
max_views_by_user = se.get_max_views_by_user(interactions)
se.show_num_article_interaction_distribution(interactions)

# SECTION 1.2
print(f'median: {se.get_median_num_article_interaction(interactions)}')
print(f'max views by user: {se.get_max_num_article_interaction(interactions)}')

# SECTION 1.3
articles = se.remove_dupes(articles)
articles.to_csv('data/articles.csv', index=False)

# SECTION 1.4
unique_articles = se.get_num_articles_with_interaction(interactions)
total_articles = se.get_num_articles(articles)
unique_users = se.get_unique_users(interactions)
user_article_interactions = len(interactions)
import utility.util as ut
import sklearn.preprocessing as pp
import numpy as np
import keras.models as km
import keras.layers as kl
import keras as kr
# import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import statistics as st

ut.widen_df_display()

# Get and reshape data
msft_train = ut.read_csv('msft_train.csv')

msft_train['Open_Delta'] = (msft_train['Open'] - msft_train['Open'].shift(1)) / msft_train['Open']
msft_train.at[0, 'Open_Delta'] = 0

# The Open_Delta column added above (column index 7)
training_set = msft_train.iloc[:, 7].values
training_set = np.reshape(training_set, (-1, 1))

x_train = []
y_train = []
for i in range(50, 2000):
    x_train.append(training_set[i - 50:i, 0])
    y_train.append(training_set[i, 0])

x_train, y_train = np.array(x_train), np.array(y_train)

x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))