def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    process_html_files(input_filepath, output_filepath)
    clean_data('data/interim/houses.csv', 'data/processed/houses.csv')
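# In the cookiecutter-data-science template this kind of main() is usually exposed on the
# command line via click; a hedged sketch of that wiring (the decorators and the cli()
# wrapper below are assumptions, not part of the excerpt above):
import logging
import click

@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def cli(input_filepath, output_filepath):
    main(input_filepath, output_filepath)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(message)s')
    cli()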
def main():
    my_filename = "raw_accel.csv"
    # calls the cleaning function with params: filepath, output filename (if desired),
    # and index to plot graph (if desired)
    moves = pd.DataFrame(clean_data(my_filename, None, None))
    X = moves[moves.columns[2:]]
    y = moves['move']
    # plot_graph(moves, 'z')

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    neighbours = 15
    knn_model = KNeighborsClassifier(n_neighbors=neighbours)
    nb_model = GaussianNB()
    knn_model.fit(X_train, y_train)
    nb_model.fit(X_train, y_train)
    my_knn_score = knn_model.score(X_test, y_test)
    my_nb_score = nb_model.score(X_test, y_test)

    a_filename = "subject_a.csv"
    a_moves = pd.DataFrame(clean_data(a_filename, None, None))
    X_a = a_moves[a_moves.columns[2:]]
    y_a = a_moves['move']
    a_score = knn_model.score(X_a, y_a)

    b_filename = "subject_b.csv"
    b_moves = pd.DataFrame(clean_data(b_filename, None, None))
    X_b = b_moves[b_moves.columns[2:]]
    y_b = b_moves['move']
    b_score = knn_model.score(X_b, y_b)

    c_filename = "subject_c.csv"
    c_moves = pd.DataFrame(clean_data(c_filename, None, None))
    X_c = c_moves[c_moves.columns[2:]]
    y_c = c_moves['move']
    c_score = knn_model.score(X_c, y_c)

    d_filename = "subject_d.csv"
    d_moves = pd.DataFrame(clean_data(d_filename, None, None))
    X_d = d_moves[d_moves.columns[2:]]
    y_d = d_moves['move']
    d_score = knn_model.score(X_d, y_d)

    others = pd.concat([a_moves, b_moves, c_moves, d_moves])
    others_score = knn_model.score(others[others.columns[2:]], others['move'])

    print(OUTPUT_TEMPLATE.format(
        bayes=my_nb_score,
        knn=my_knn_score,
        subj_a=a_score,
        subj_b=b_score,
        subj_c=c_score,
        subj_d=d_score,
        subj_all=others_score
    ))
def cleaner():
    # put the full path names to these files
    features_to_keep = './features_to_keep.txt'
    # decision_scheduling_merge = './../data/dsmfc_short.csv'
    decision_scheduling_merge = './../../data/_decision_scheduling_merge_final_converted_1000.csv'

    # clean the data
    print('Cleaning data with courts_data')
    courts_data = clean_data(features_to_keep, decision_scheduling_merge)

    # make features
    print('Making history features add_history_features_to_courts_data')
    add_history_features_to_courts_data(courts_data, dict_of_groupbys)
    courts_data.fillna(0, inplace=True)
    print(courts_data.head(6))

    print('Making dummy features make_dummies')
    courts_data = make_dummies(courts_data, categoricals)

    print('Making time features make_hearing_edate_features, make_hearing_half_hour')
    make_hearing_edate_features(courts_data)
    make_hearing_half_hour(courts_data)

    courts_data.to_csv('./../../data/cleaned_with_features.csv', index=False)
def __init__(self, path, always_apply=False, p=0.5):
    super(SynthesicOpenSubtitlesTransform, self).__init__(always_apply, p)

    df = pd.read_csv(path, index_col='id')[['comment_text', 'toxic', 'lang']]
    df = df[~df['comment_text'].isna()]
    df = cleaning.clean_data(df, ['comment_text'])
    df = df.drop_duplicates(subset='comment_text')
    # plain int instead of np.int, which has been removed from recent NumPy releases
    df['toxic'] = df['toxic'].round().astype(int)

    self.synthesic_toxic = df[df['toxic'] == 1].comment_text.values
    self.synthesic_non_toxic = df[df['toxic'] == 0].comment_text.values

    del df
    gc.collect()
def job():
    cursor = db_connection.conn.cursor()

    # Get required data
    covid_cases = data.getCovidCases()
    geo_neighborhoods = data.getGeoData()

    # Clean data
    clean_data = data_cleaning.clean_data(covid_cases, geo_neighborhoods)

    # Create map
    map.create_map(clean_data)
    with open('Covid-19_confirmed_cases_fortaleza.html', 'r') as map_file:
        map_html = map_file.read()

    cursor.execute('UPDATE COVID19 SET MAP_HTML = %s WHERE ID = 1', (map_html, ))
    db_connection.conn.commit()
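# Hedged usage sketch: the name job() suggests this is meant to run on a timer. With the
# third-party `schedule` package it could be wired up as below; this is an assumption, and
# the project may use cron, APScheduler, or something else entirely.
import time
import schedule

schedule.every().hour.do(job)
while True:
    schedule.run_pending()
    time.sleep(60)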
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xg

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# load data
data = clean_data(load_train_data())

# split data into training/testing sets
train, test = train_test_split(data, test_size=0.3, random_state=0, stratify=data['Survived'])
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]
X = data[data.columns[1:]]
Y = data['Survived']

# Radial Support Vector Machine (rbf-SVM)
model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
model.fit(train_X, train_Y)
prediction1 = model.predict(test_X)
print('Accuracy for rbf SVM is ', metrics.accuracy_score(prediction1, test_Y))
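# The imports above pull in VotingClassifier and several boosted ensembles that this
# excerpt never reaches; a minimal hedged sketch of a soft-voting ensemble on the same
# split (the estimator choices and hyper-parameters here are assumptions, not the
# original configuration):
ensemble = VotingClassifier(
    estimators=[('svm', svm.SVC(kernel='rbf', C=1, gamma=0.1, probability=True)),
                ('ada', AdaBoostClassifier(n_estimators=200, random_state=0)),
                ('gbc', GradientBoostingClassifier(n_estimators=500, random_state=0))],
    voting='soft')
ensemble.fit(train_X, train_Y.values.ravel())
prediction_ensemble = ensemble.predict(test_X)
print('Accuracy for voting ensemble is ', metrics.accuracy_score(prediction_ensemble, test_Y))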
""" import pandas as pd import numpy as np import matplotlib.pyplot as plt import data_cleaning as cleaner import SurpriseMatrixFactorization as sp # data cleaning (only need to run once if you store the cleaned files) path_to_original_movies_file = '../data/movies.txt' path_to_original_data='../data/data.txt' movies, duplicate_count, replace_table = \ cleaner.clean_movies(path_to_original_movies_file, save=True) data = cleaner.clean_data(replace_table, path_to_original_data, save_new_data='txt') path_to_train_data='../data/train.txt' train_data = cleaner.clean_data(replace_table, path_to_train_data, save_new_data='txt') path_to_test_data='../data/test.txt' test_data = cleaner.clean_data(replace_table, path_to_test_data, save_new_data='txt') ################ # SVD (biased) # ################ n_factors=100 n_epochs=20 lr_all=0.005 reg_all=0.02
import numpy as np
import pandas as pd
import pickle
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_score, accuracy_score

from algorithms import RF, GBC, ABC, cross_validate
from data_cleaning import clean_data

# Load data
df = pd.read_json('data/data.zip')
X = clean_data(df)
y = df.acct_type.map(lambda x: 1 if ((x == 'fraudster') |
                                     (x == 'fraudster_att') |
                                     (x == 'fraudster_event'))
                     else 0)

# Run Classification Algorithms
y_hat_RF, y_pred_RF, y_test_RF = cross_validate(X, y, 'RF')
y_hat_GBC, y_pred_GBC, y_test_GBC = cross_validate(X, y, 'GBC')
y_hat_ABC, y_pred_ABC, y_test_ABC = cross_validate(X, y, 'ABC')

# Evaluate performance to pick the best model
precision_rf = precision_score(y_test_RF, y_pred_RF)
precision_g = precision_score(y_test_GBC, y_pred_GBC)
precision_a = precision_score(y_test_ABC, y_pred_ABC)
acc_rf = accuracy_score(y_test_RF, y_pred_RF)
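# pickle is imported but unused in this excerpt; a hedged sketch of how the best model
# might be picked by precision and persisted. Refitting GradientBoostingClassifier on all
# of X, y is an assumption here, not the project's actual model-selection step.
scores = {'RF': precision_rf, 'GBC': precision_g, 'ABC': precision_a}
best_label = max(scores, key=scores.get)
print('Best model by precision:', best_label)
final_model = GradientBoostingClassifier().fit(X, y)  # swap in whichever model wins
with open('best_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)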
"It does that by taking the recent 300 tweets from that topic/hashtag.") st.markdown( "<sub>Extracting 300 tweets and cleaning them takes time so please wait for a minute or two</sub>", unsafe_allow_html=True) phrase = st.text_input("enter the hashtag or the keyword") activities = [ "Histogram of Positive, Negative and Neutral tweets", "Polarity-Subjectivity Scatterplot", "Top five Positive tweets", "Top five Negative tweets" ] choice = st.selectbox("Select Your Activity", activities) if phrase is not "": get_tweets.insert_to_csv(hashtag_phrase=phrase) st.success("Scraping successful. Cleaning and formatting data...") data_cleaning.clean_data() plt.style.use('fivethirtyeight') df = pd.read_csv("cleaned_output.csv") def getSubjectivity(text): return float(TextBlob(text).sentiment.subjectivity) def getPolarity(text): return float(TextBlob(text).sentiment.polarity) def getAnalysis(score): if score < 0: return "Negative" elif score == 0: return "Neutral"
import sys
import json
from pprint import pprint
import string

import data_cleaning as dc

# if len(sys.argv) > 1:
argument = '{"query":"Kms api/services failing with 404 errorcode","Tag":"kms/engagement"}'
print(argument)

if isinstance(argument, str):
    query = json.loads(argument)
    tokened_query = dc.clean_data(query['query'])
    print(tokened_query)
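# The commented-out sys.argv check above hints that the query normally arrives on the
# command line; a minimal hedged sketch of that branch (an assumption, not the original
# code path):
if len(sys.argv) > 1:
    argument = sys.argv[1]
    query = json.loads(argument)
    print(dc.clean_data(query['query']))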
from time import time

from email_sender import EmailSender
from crawler import LinkedInCrawler
from data_cleaning import clean_data

if __name__ == '__main__':
    """
    This is how the workflow is supposed to run. However, if the user wants to run the
    program just to test it, I recommend running the scripts separately, as a single
    collection for one company in one city alone takes between 40 and 45 minutes.
    The user can also limit the number of pages to be searched by passing the desired
    number (int) as a param to the get_data function below.

    If the scripts are to be run separately, this is the order to run the files:
    crawler_linkedin.py > data_cleaning.py > email_sender.py

    If the user doesn't want to pester Uber's employees again, just make a list with a
    few names (first and last) and substitute the param in the send_email function.
    """
    start = time()
    cities = ['São Paulo', 'San Francisco']
    company = ['Uber']

    # TODO: You must install ChromeDriver on your computer from
    # https://chromedriver.chromium.org/ for selenium to work

    # Collects the results for all city and company combinations
    raw_data = LinkedInCrawler.get_data(cities, company)

    # Clean the data
    cleaned_data = clean_data(raw_data)

    # Send the emails
    EmailSender.send_email(cleaned_data)

    total = time() - start
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# load data
train_data = clean_data(load_train_data())
train_data.drop(['PassengerId'], axis=1, inplace=True)
test_data = clean_data(load_test_data())

# split training data into training/testing sets
train, test = train_test_split(train_data, test_size=0.3, random_state=0, stratify=train_data['Survived'])
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]
X = train_data[train_data.columns[1:]]
Y = train_data['Survived']

# Hyper-parameter tuning for AdaBoost
n_estimators = list(range(100, 1100, 100))
learn_rate = [0.05, 0.1, 0.2, 0.3, 0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
hyper = {'n_estimators': n_estimators, 'learning_rate': learn_rate}
gd = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=hyper, verbose=True)
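# A short, hedged continuation: fitting the grid and reading out the best AdaBoost
# configuration uses only standard GridSearchCV attributes; what the original script does
# next (e.g. predicting on test_data for a submission) is not shown here.
gd.fit(X, Y.values.ravel())
print('Best CV score:', gd.best_score_)
print('Best parameters:', gd.best_params_)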
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import os
from sklearn import preprocessing

import GeoSpatialClustering as GSC
from AuxMethods import AuxMethods as Aux
from GeoSpatialPlot import GeoSpatialPlot as GSP
from SpatialAutocorrelation import SpatialAutocorrelation as SAC
import data_cleaning

#%% read data
if False:
    path = r'C:\Users\Vijeta\Documents\Projects\Sizanani\Data'
    dict_maps, dict_geo_df, df_clustering = data_cleaning.clean_data(path)

#%% distribution and correlation
if False:
    # create folder
    if not os.path.exists(os.path.join(path, 'Exploratory plots')):
        os.makedirs(os.path.join(path, 'Exploratory plots'))
    path_expl = os.path.join(path, 'Exploratory plots')

    f, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 12))
    # Make the axes accessible with single indexing
    axs = axs.flatten()

    # Start the loop over all the variables of interest
    for i, col in enumerate(['support', 'CD4Results', 'MHIINDX3']):
        # select the axis where the map will go
def clean_chicago_business_data(self):
    self.logger.info(" Running cleaning steps on raw data")
    self.dataframe = clean_data(self.dataframe, self.data_cleaning)
    return self
dict_of_groupbys = {('nat', 'base_city_code'): [5, 1],
                    ('nat', 'c_asy_type', 'base_city_code'): [5, 1],
                    ('nat', 'langid', 'c_asy_type', 'base_city_code'): [5, 1]}

categoricals = ['nat', 'case_type', 'appl_code', 'c_asy_type', 'base_city_code',
                'hearing_loc_code', 'attorney_flag', 'schedule_type', 'langid']

def cleaner():
    # put the full path names to these files
    features_to_keep = './cleaning_data/features_to_keep.txt'
    # decision_scheduling_merge = './../data/dsmfc_short.csv'
    decision_scheduling_merge = './../data/decision_scheduling_merge_final_converted.csv'

    # clean the data
    print('Cleaning data with courts_data')
    courts_data = clean_data(features_to_keep, decision_scheduling_merge)

    # make features
    print('Making history features add_history_features_to_courts_data')
    make_history_features.add_history_features_to_courts_data(courts_data, dict_of_groupbys)
    courts_data.fillna(0, inplace=True)
    print(courts_data.head(6))

    print('Making dummy features make_dummies')
    courts_data = make_dummies.make_dummies(courts_data, categoricals)

    print('Making time features make_hearing_edate_features, make_hearing_half_hour')
    make_time_features.make_hearing_edate_features(courts_data)
    make_time_features.make_hearing_half_hour(courts_data)

    courts_data.to_csv('./../data/cleaned_with_features.csv', index=False)
import matplotlib.pyplot as plt

import data_cleaning as cleaner
import basic_stats as stat

###########################
#  Read and clean data    #
###########################

# import original data and clean
path_to_original_movies_file = '../data/movies.txt'
path_to_original_data = '../data/data.txt'
movies, duplicate_count, replace_table = \
    cleaner.clean_movies(path_to_original_movies_file, save=True)
data = cleaner.clean_data(replace_table, path_to_original_data, save_new_data='npy')

# # or import cleaned data
# path_to_clean_movies_file = '../data/movies_nodup.txt'
# path_to_clean_data_file = '../data/data_clean.npy'
# movies = cleaner.read_movie_as_dataframe(path_to_clean_movies_file)
# data = np.load(path_to_clean_data_file)

# create movie title-ID lookup dictionary
id_title_dict = {}
for index, row in movies.iterrows():
    movie_id = row[0]
    movie_title = row[1]
    id_title_dict[movie_id] = movie_title
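# Minor aside: the same lookup can be built without the explicit loop, assuming the first
# two columns of movies are id and title as the loop above implies:
# id_title_dict = dict(zip(movies.iloc[:, 0], movies.iloc[:, 1]))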