Example #1
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    process_html_files(input_filepath, output_filepath)
    clean_data('data/interim/houses.csv', 'data/processed/houses.csv')
Example #2
def main():
    my_filename = "raw_accel.csv"
    
    # calls the cleaning function with params: filepath, output filename (if desired), and index to plot graph (if desired)
    moves = pd.DataFrame(clean_data(my_filename, None, None))
    X = moves[moves.columns[2:]]
    y = moves['move']
#    plot_graph(moves, 'z')
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    neighbours = 15
    knn_model = KNeighborsClassifier(n_neighbors=neighbours)
    nb_model = GaussianNB()
    knn_model.fit(X_train, y_train)
    nb_model.fit(X_train, y_train)
    my_knn_score = knn_model.score(X_test, y_test)
    my_nb_score = nb_model.score(X_test, y_test)
    
    a_filename = "subject_a.csv"
    a_moves = pd.DataFrame(clean_data(a_filename, None, None))
    X_a = a_moves[a_moves.columns[2:]]
    y_a = a_moves['move']
    a_score = knn_model.score(X_a, y_a)
    
    b_filename = "subject_b.csv"
    b_moves = pd.DataFrame(clean_data(b_filename, None, None))
    X_b = b_moves[b_moves.columns[2:]]
    y_b = b_moves['move']
    b_score = knn_model.score(X_b, y_b)
    
    
    c_filename = "subject_c.csv"
    c_moves = pd.DataFrame(clean_data(c_filename, None, None))
    X_c = c_moves[c_moves.columns[2:]]
    y_c = c_moves['move']
    c_score = knn_model.score(X_c, y_c)
    
    d_filename = "subject_d.csv"
    d_moves = pd.DataFrame(clean_data(d_filename, None, None))
    X_d = d_moves[d_moves.columns[2:]]
    y_d = d_moves['move']
    d_score = knn_model.score(X_d, y_d)

    others = pd.concat([a_moves, b_moves, c_moves, d_moves])

    others_score = knn_model.score(others[others.columns[2:]], others['move'])
    
    print(OUTPUT_TEMPLATE.format(
        bayes=my_nb_score,
        knn=my_knn_score,
        subj_a = a_score,
        subj_b = b_score,
        subj_c = c_score,
        subj_d = d_score,
        subj_all = others_score
    ))
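
# OUTPUT_TEMPLATE is referenced above but not shown in this excerpt; a plausible
# definition, given the keyword arguments passed to .format(), might look like the
# hypothetical constant below (an illustrative assumption, not the original project's).
OUTPUT_TEMPLATE = (
    'Bayes classifier:      {bayes:.3f}\n'
    'kNN classifier:        {knn:.3f}\n'
    'Subject A score:       {subj_a:.3f}\n'
    'Subject B score:       {subj_b:.3f}\n'
    'Subject C score:       {subj_c:.3f}\n'
    'Subject D score:       {subj_d:.3f}\n'
    'All other subjects:    {subj_all:.3f}'
)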
Example #3
def cleaner():
    # put the full path name to these files
    features_to_keep = './features_to_keep.txt'

    # decision_scheduling_merge = './../data/dsmfc_short.csv'
    decision_scheduling_merge = './../../data/_decision_scheduling_merge_final_converted_1000.csv'

    # clean the data
    print('Cleaning data with courts_data')
    courts_data = clean_data(features_to_keep, decision_scheduling_merge)

    # make features
    print('Making history features add_history_features_to_courts_data')
    add_history_features_to_courts_data(courts_data, dict_of_groupbys)
    courts_data.fillna(0, inplace=True)
    print(courts_data.head(6))

    print('Making dummy features make_dummies')
    courts_data = make_dummies(courts_data, categoricals)

    print('Making time features make_hearing_edate_features, make_hearing_half_hour')
    make_hearing_edate_features(courts_data)
    make_hearing_half_hour(courts_data)

    courts_data.to_csv('./../../data/cleaned_with_features.csv', index=False)
Example #4
    def __init__(self, path, always_apply=False, p=0.5):
        super(SynthesicOpenSubtitlesTransform, self).__init__(always_apply, p)
        df = pd.read_csv(path,
                         index_col='id')[['comment_text', 'toxic', 'lang']]
        df = df[~df['comment_text'].isna()]
        df = cleaning.clean_data(df, ['comment_text'])
        df = df.drop_duplicates(subset='comment_text')
        df['toxic'] = df['toxic'].round().astype(int)

        self.synthesic_toxic = df[df['toxic'] == 1].comment_text.values
        self.synthesic_non_toxic = df[df['toxic'] == 0].comment_text.values

        del df
        gc.collect()
Example #5
def job():
    cursor = db_connection.conn.cursor()

    # Get required data
    covid_cases = data.getCovidCases()
    geo_neighborhoods = data.getGeoData()

    # Clean data
    clean_data = data_cleaning.clean_data(covid_cases, geo_neighborhoods)

    # Create map
    map.create_map(clean_data)

    with open('Covid-19_confirmed_cases_fortaleza.html', 'r') as map_file:
        map_html = map_file.read()

    cursor.execute('UPDATE COVID19 SET MAP_HTML = %s WHERE ID = 1',
                   (map_html, ))

    db_connection.conn.commit()
Example #6
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xg

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# load data
data = clean_data(load_train_data())

# split data into training/testing sets
train,test=train_test_split(data,test_size=0.3,random_state=0,stratify=data['Survived'])
train_X=train[train.columns[1:]]
train_Y=train[train.columns[:1]]
test_X=test[test.columns[1:]]
test_Y=test[test.columns[:1]]
X=data[data.columns[1:]]
Y=data['Survived']

# Radial Support Vector Machines(rbf-SVM)
model=svm.SVC(kernel='rbf',C=1,gamma=0.1)
model.fit(train_X,train_Y)
prediction1=model.predict(test_X)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction1,test_Y))
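
# GridSearchCV is imported above but never used in this excerpt; a minimal sketch of how
# it could tune the rbf-SVM's C and gamma (the grid values here are assumptions, not
# taken from the original project):
param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, cv=5)
grid.fit(X, Y)
print('Best CV accuracy:', grid.best_score_)
print('Best parameters:', grid.best_params_)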
Example #7
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import data_cleaning as cleaner
import SurpriseMatrixFactorization as sp


# data cleaning (only need to run once if you store the cleaned files)
path_to_original_movies_file = '../data/movies.txt'
path_to_original_data='../data/data.txt'
movies, duplicate_count, replace_table = \
    cleaner.clean_movies(path_to_original_movies_file, save=True)
data = cleaner.clean_data(replace_table, path_to_original_data, save_new_data='txt')
path_to_train_data='../data/train.txt'
train_data = cleaner.clean_data(replace_table, path_to_train_data, save_new_data='txt')
path_to_test_data='../data/test.txt'
test_data = cleaner.clean_data(replace_table, path_to_test_data, save_new_data='txt')


################
# SVD (biased) #
################

n_factors=100
n_epochs=20
lr_all=0.005
reg_all=0.02
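
# These hyperparameter names match Surprise's SVD signature; a minimal sketch of the
# biased-SVD fit using the surprise library directly. The original project presumably
# routes this through SurpriseMatrixFactorization, and the (user, movie, rating) column
# layout of train_data is an assumption:
from surprise import SVD, Dataset, Reader

reader = Reader(rating_scale=(1, 5))
ratings_df = pd.DataFrame(train_data, columns=['user', 'movie', 'rating'])
trainset = Dataset.load_from_df(ratings_df, reader).build_full_trainset()
algo = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)
algo.fit(trainset)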
Example #8
import numpy as np
import pandas as pd
import pickle
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_score, accuracy_score
from algorithms import RF, GBC, ABC, cross_validate
from data_cleaning import clean_data

# Load data
df = pd.read_json('data/data.zip')

X = clean_data(df)
y = df.acct_type.map(lambda x: 1 if x in ('fraudster', 'fraudster_att', 'fraudster_event') else 0)

# Run Classification Algorithms
y_hat_RF, y_pred_RF, y_test_RF = cross_validate(X, y, 'RF')
y_hat_GBC, y_pred_GBC, y_test_GBC = cross_validate(X, y, 'GBC')
y_hat_ABC, y_pred_ABC, y_test_ABC = cross_validate(X, y, 'ABC')

# Evaluate Performance to pick best model
precision_rf = precision_score(y_test_RF, y_pred_RF)
precision_g = precision_score(y_test_GBC, y_pred_GBC)
precision_a = precision_score(y_test_ABC, y_pred_ABC)
acc_rf = accuracy_score(y_test_RF, y_pred_RF)
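
# roc_curve, auc and confusion_matrix are imported above but unused in this excerpt; a
# minimal sketch of a further check on the random-forest results, assuming y_hat_RF holds
# positive-class probabilities returned by the custom cross_validate helper:
fpr, tpr, _ = roc_curve(y_test_RF, y_hat_RF)
print('RF ROC AUC:', auc(fpr, tpr))
print('RF confusion matrix:\n', confusion_matrix(y_test_RF, y_pred_RF))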
Example #9
    "It does that by taking the recent 300 tweets from that topic/hashtag.")
st.markdown(
    "<sub>Extracting 300 tweets and cleaning them takes time so please wait for a minute or two</sub>",
    unsafe_allow_html=True)
phrase = st.text_input("enter the hashtag or the keyword")

activities = [
    "Histogram of Positive, Negative and Neutral tweets",
    "Polarity-Subjectivity Scatterplot", "Top five Positive tweets",
    "Top five Negative tweets"
]
choice = st.selectbox("Select Your Activity", activities)
if phrase != "":
    get_tweets.insert_to_csv(hashtag_phrase=phrase)
    st.success("Scraping successful. Cleaning and formatting data...")
    data_cleaning.clean_data()

    plt.style.use('fivethirtyeight')
    df = pd.read_csv("cleaned_output.csv")

    def getSubjectivity(text):
        return float(TextBlob(text).sentiment.subjectivity)

    def getPolarity(text):
        return float(TextBlob(text).sentiment.polarity)

    def getAnalysis(score):
        if score < 0:
            return "Negative"
        elif score == 0:
            return "Neutral"
        else:
            return "Positive"
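
    # A minimal sketch of how these helpers are typically applied to the cleaned tweets;
    # the 'tweet' column name is an assumption about cleaned_output.csv, not confirmed above:
    df['Polarity'] = df['tweet'].apply(getPolarity)
    df['Subjectivity'] = df['tweet'].apply(getSubjectivity)
    df['Analysis'] = df['Polarity'].apply(getAnalysis)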
Example #10
import sys
import json
from pprint import pprint
import string
import data_cleaning as dc
#if len(sys.argv) > 1:
argument='{"query":"Kms api/services failing with 404 errorcode","Tag":"kms/engagement"}'
print(argument)
if isinstance(argument, str):
    query = json.loads(argument)
    tokened_query=dc.clean_data(query['query'])
    print(tokened_query)
Example #11
from time import time

from email_sender import EmailSender
from crawler import LinkedInCrawler
from data_cleaning import clean_data

if __name__ == '__main__':
    """
    This is how the full workflow is supposed to run. However, if the user wants to test the program,
    I recommend running the scripts separately, since collecting a single company in a single city alone
    takes roughly 40-45 minutes. The user can also limit the number of pages searched by passing the
    desired number (int) as a parameter to the get_data function below.
    If the scripts are run separately, this is the order in which to run them:
    crawler_linkedin.py > data_cleaning.py > email_sender.py

    If the user doesn't want to pester Uber's employees again, just make a list with a few names
    (first and last) and substitute the parameter in the send_email function.
    """
    start = time()

    cities = ['São Paulo', 'San Francisco']
    company = ['Uber']
    # TODO: You must install ChromeDriver on your computer from https://chromedriver.chromium.org/ for selenium to work

    # Collects the results from the all cities and companies combinations
    raw_data = LinkedInCrawler.get_data(cities, company)
    # Clean the data
    cleaned_data = clean_data(raw_data)
    # Send the emails
    EmailSender.send_email(cleaned_data)

    total = time() - start
Example #12
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# load data
train_data = clean_data(load_train_data())
train_data.drop(['PassengerId'], axis=1, inplace=True)
test_data = clean_data(load_test_data())

# split training data into training/testing sets
train,test=train_test_split(train_data,test_size=0.3,random_state=0,stratify=train_data['Survived'])
train_X=train[train.columns[1:]]
train_Y=train[train.columns[:1]]
test_X=test[test.columns[1:]]
test_Y=test[test.columns[:1]]
X=train_data[train_data.columns[1:]]
Y=train_data['Survived']

# Hyper-Parameter Tuning for AdaBoost
n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
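
# The excerpt stops before the search is actually run; the usual next step would be:
gd.fit(X, Y)
print('Best CV accuracy:', gd.best_score_)
print('Best estimator:', gd.best_estimator_)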
Example #13
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import os
from sklearn import preprocessing
import GeoSpatialClustering as GSC
from AuxMethods import AuxMethods as Aux
from GeoSpatialPlot import GeoSpatialPlot as GSP
from SpatialAutocorrelation import SpatialAutocorrelation as SAC
import data_cleaning

#%% read data
if False:
    path = r'C:\Users\Vijeta\Documents\Projects\Sizanani\Data'
    dict_maps, dict_geo_df, df_clustering = data_cleaning.clean_data(path)

#%% distribution and correlation

if False:
    # create folder
    if not os.path.exists(os.path.join(path, 'Exploratory plots')):
        os.makedirs(os.path.join(path, 'Exploratory plots'))
    path_expl = os.path.join(path, 'Exploratory plots')
    f, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 12))
    # Make the axes accessible with single indexing
    axs = axs.flatten()

    # Start the loop over all the variables of interest
    for i, col in enumerate(['support', 'CD4Results', 'MHIINDX3']):
        # select the axis where the map will go
Example #14
def clean_chicago_business_data(self):
    self.logger.info("    Running cleaning steps on raw data")
    self.dataframe = clean_data(self.dataframe, self.data_cleaning)
    return self
Example #15
dict_of_groupbys = {('nat', 'base_city_code'): [5,1], ('nat', 'c_asy_type', 'base_city_code'): [5,1], ('nat', 'langid', 'c_asy_type', 'base_city_code'): [5,1]}


categoricals = ['nat', 'case_type', 'appl_code', 'c_asy_type', 'base_city_code', 'hearing_loc_code','attorney_flag', 'schedule_type', 'langid']


def cleaner():
    # put the full path name to these files
    features_to_keep = './cleaning_data/features_to_keep.txt'

#    decision_scheduling_merge = './../data/dsmfc_short.csv'
    decision_scheduling_merge = './../data/decision_scheduling_merge_final_converted.csv'

    # clean the data
    print('Cleaning data with courts_data')
    courts_data = clean_data(features_to_keep, decision_scheduling_merge)

    # make features
    print('Making history features add_history_features_to_courts_data')
    make_history_features.add_history_features_to_courts_data(courts_data, dict_of_groupbys)
    courts_data.fillna(0, inplace=True)
    print(courts_data.head(6))

    print('Making dummy features make_dummies')
    courts_data = make_dummies.make_dummies(courts_data, categoricals)

    print('Making time features make_hearing_edate_features, make_hearing_half_hour')
    make_time_features.make_hearing_edate_features(courts_data)
    make_time_features.make_hearing_half_hour(courts_data)

    courts_data.to_csv('./../data/cleaned_with_features.csv', index=False)
Example #16
import matplotlib.pyplot as plt

import data_cleaning as cleaner
import basic_stats as stat

###########################
#   Read and clean data   #
###########################

# import original data and clean
path_to_original_movies_file = '../data/movies.txt'
path_to_original_data = '../data/data.txt'
movies, duplicate_count, replace_table = \
    cleaner.clean_movies(path_to_original_movies_file, save=True)
data = cleaner.clean_data(replace_table,
                          path_to_original_data,
                          save_new_data='npy')

# # or import cleaned data
# path_to_clean_movies_file = '../data/movies_nodup.txt'
# path_to_clean_data_file = '../data/data_clean.npy'
# movies = cleaner.read_movie_as_dataframe(path_to_clean_movies_file)
# data = np.load(path_to_clean_data_file)

# create movie title-ID lookup dictionary
id_title_dict = {}
for index, row in movies.iterrows():
    movie_id = row[0]
    movie_title = row[1]
    id_title_dict[movie_id] = movie_title
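
# The same lookup can be built in one line; equivalent, assuming the first two columns
# of `movies` are the movie ID and title exactly as iterated above:
# id_title_dict = dict(zip(movies.iloc[:, 0], movies.iloc[:, 1]))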