def min_temperature_on_rainy_days(filename):
    '''
    This function should run a SQL query on a dataframe of weather data.
    More specifically, you want to find the average minimum temperature on
    rainy days where the minimum temperature is greater than 55 degrees.

    You might also find that interpreting numbers as integers or floats may
    not work initially. In order to get around this issue, it may be useful
    to cast these numbers as integers. This can be done by writing
    cast(column as integer). So for example, if we wanted to cast the
    maxtempi column as an integer, we would actually write something like
    where cast(maxtempi as integer) = 76, as opposed to simply
    where maxtempi = 76.

    You can see the weather data that we are passing in below:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    '''
    weather_data = pandas.read_csv(filename)

    q = """
    select avg(mintempi)
    from weather_data
    where cast(rain as integer) = 1
      and cast(mintempi as integer) > 55
    """

    # Execute your SQL command against the pandas frame
    avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals())
    return avg_min_temp_rainy
def test_noleak_legacy(db_uri):
    df = pd.DataFrame({'x': [1]})
    result = sqldf("SELECT * FROM df", db_uri=db_uri)
    pdtest.assert_frame_equal(df, result)

    del df
    with pytest.raises(PandaSQLException):
        result = sqldf("SELECT * FROM df", db_uri=db_uri)
def test_query_with_spacing(self):
    df = pd.DataFrame({
        "letter_pos": [i for i in range(len(string.ascii_letters))],
        "l2": list(string.ascii_letters)
    })
    df2 = pd.DataFrame({
        "letter_pos": [i for i in range(len(string.ascii_letters))],
        "letter": list(string.ascii_letters)
    })

    result = sqldf("SELECT a.*, b.letter FROM df a INNER JOIN df2 b ON a.l2 = b.letter LIMIT 20;", locals())
    self.assertEqual(len(result), 20)

    q = """
        SELECT
        a.*
        FROM df a
        INNER JOIN df2 b
        on a.l2 = b.letter
        LIMIT 20
        ;"""
    result = sqldf(q, locals())
    self.assertEqual(len(result), 20)
def count(self):
    eventsdb = self.eventsdb
    eventsdb_sorted = sqldf('SELECT * from eventsdb ORDER BY id', locals())
    numevents = len(eventsdb)
    count = [i for i in range(0, numevents)]
    eventsdb_sorted['count'] = count
    self.eventsdb = sqldf('SELECT * from eventsdb_sorted ORDER BY id', locals())
    self.eventsdb_subset = self.eventsdb
def survival_probability(self):
    eventsdb = self.eventsdb
    eventsdb_sorted = sqldf('SELECT * from eventsdb ORDER BY duration_us', locals())
    numevents = len(eventsdb)
    survival = [1.0 - float(i)/float(numevents) for i in range(0, numevents)]
    eventsdb_sorted['survival_probability'] = survival
    self.eventsdb = sqldf('SELECT * from eventsdb_sorted ORDER BY id', locals())
    self.eventsdb_subset = self.eventsdb
def delay_probability(self):
    eventsdb = self.eventsdb
    eventsdb_sorted = sqldf('SELECT * from eventsdb ORDER BY event_delay_s', locals())
    numevents = len(eventsdb)
    delay = [1.0 - float(i)/float(numevents) for i in range(0, numevents)]
    eventsdb_sorted['delay_probability'] = delay
    self.eventsdb = sqldf('SELECT * from eventsdb_sorted ORDER BY id', locals())
    self.eventsdb_subset = self.eventsdb
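# A hedged, pure-pandas sketch of the survival_probability step above, for comparison:
# it attaches the same 1 - i/n column without round-tripping through pandasql.
# The 'duration_us' column name is taken from the snippet; everything else is illustrative.
import pandas as pd

def add_survival_probability(eventsdb: pd.DataFrame) -> pd.DataFrame:
    n = len(eventsdb)
    # The k-th shortest event (0-based) gets survival probability 1 - k/n,
    # matching the list comprehension applied above to the sorted frame.
    order = eventsdb['duration_us'].rank(method='first') - 1
    out = eventsdb.copy()
    out['survival_probability'] = 1.0 - order / n
    return out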
def entries_time_period_bar(turnstile_weather):
    # add new fields
    turnstile_weather['dayofweek'] = pd.DatetimeIndex(turnstile_weather['DATEn']).dayofweek
    turnstile_weather['day'] = pd.DatetimeIndex(turnstile_weather['DATEn']).day
    turnstile_weather['weekday'] = 0
    turnstile_weather.loc[turnstile_weather['dayofweek'] <= 4, 'weekday'] = 1

    # Alias the derived column as time_period so it matches the name used in q2 below
    q = """
    select *
        ,case when Hour <= 5 then 1
              when weekday = 0 and Hour >= 6 then 2
              when weekday = 1 and Hour between 10 and 14 then 3
              when weekday = 1 and Hour >= 20 then 4
              when weekday = 1 and Hour between 6 and 9 then 5
              when weekday = 1 and Hour between 15 and 19 then 5
              else 0 end as time_period
    from turnstile_weather
    """
    # Execute your SQL command against the pandas frame
    turnstile_weather2 = pandasql.sqldf(q.lower(), locals())

    q2 = """
    select time_period, sum(ENTRIESn_hourly) as ENTRIESn_hourly
    from turnstile_weather2
    group by time_period
    """
    # Execute your SQL command against the pandas frame
    entries_time_period = pandasql.sqldf(q2.lower(), locals())

    plt.figure()
    plt.title('Turnstile Entries by Time Period')
    plt.ylabel('Turnstile Entries (in millions)')
    plt.xlabel('Time Period')
    # q2 was lowercased before execution, so the result columns are lowercase
    y = entries_time_period['entriesn_hourly'] / 1000000
    x = entries_time_period['time_period']
    labels = ['Late Night', 'Weekends', 'Midday', 'Evening', 'Rush Hour']
    plt.xticks(x, labels)
    plt.xlim(0, 6)
    plt.ylim(0, 25)
    plt.bar(x, y, width=0.25, align='center', color='DodgerBlue')
    #print entries_time_period
    return plt
def compare_averages(filename):
    """
    Performs a t-test on two sets of baseball data (left-handed and right-handed hitters).

    You will be given a csv file that has three columns: a player's name,
    handedness (L for lefthanded or R for righthanded) and their career
    batting average (called 'avg'). You can look at the csv file via the
    following link:
    https://www.dropbox.com/s/xcn0u2uxm8c4n6l/baseball_data.csv

    Write a function that will read the csv file into a pandas data frame,
    and run Welch's t-test on the two cohorts defined by handedness.
    One cohort should be a data frame of right-handed batters, and the other
    cohort should be a data frame of left-handed batters.

    We have included the scipy.stats library to help you write or implement
    Welch's t-test:
    http://docs.scipy.org/doc/scipy/reference/stats.html

    With a significance level of 95%, if there is no difference between the
    two cohorts, return a tuple consisting of True, and then the tuple
    returned by scipy.stats.ttest. If there is a difference, return a tuple
    consisting of False, and then the tuple returned by scipy.stats.ttest.

    For example, the tuple that you return may look like:
    (True, (9.93570222, 0.000023))
    """
    theframe = pandas.read_csv(filename)
    turnstile_weather = theframe

    q = "select * from theframe where rain = 1"
    #wasraining = theframe[theframe['rain'] == 1]
    wasraining = pandasql.sqldf(q.lower(), locals())

    r = "select * from theframe where rain = 0"
    wasnotraining = pandasql.sqldf(r.lower(), locals())

    #wasraining = theframe[['ENTRIESn_hourly']][theframe['rain'] == 1]
    #numpy.nan_to_num(wasraining)
    wasnotraining = theframe[['ENTRIESn_hourly']][theframe['rain'] == 0]
    #numpy.nan_to_num(wasnotraining)

    result = scipy.stats.mannwhitneyu(turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain'] == 1],
                                      turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain'] == 0])
    #result = scipy.stats.mannwhitneyu(wasraining['ENTRIESn_hourly'], wasnotraining['ENTRIESn_hourly'])
    #result = scipy.stats.ttest_ind(wasraining['ENTRIESn_hourly'], wasnotraining['ENTRIESn_hourly'], equal_var=False)

    print(len(wasraining.index))
    print(len(wasnotraining.index))
    return result
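# The docstring above asks for Welch's t-test, while the body as written runs a
# Mann-Whitney U test on turnstile/rain data. A minimal sketch of the Welch's
# version, assuming the 'handedness' and 'avg' columns described in the docstring:
import pandas
import scipy.stats

def compare_averages_welch(filename):
    frame = pandas.read_csv(filename)
    left = frame[frame['handedness'] == 'L']['avg']
    right = frame[frame['handedness'] == 'R']['avg']
    # equal_var=False makes ttest_ind perform Welch's t-test
    t, p = scipy.stats.ttest_ind(right, left, equal_var=False)
    # Two-sided p-value: "no difference" at the 5% level means p > 0.05
    return (p > 0.05, (t, p))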
def nentries_mean(df, col_name):
    """ mean of the total number of entries for all units for each hour or day """
    # sum of the number of entries per hour for all units
    q = 'SELECT ' + col_name + ', daten, SUM(entriesperhr) AS totentries FROM df GROUP BY ' + col_name + ', daten;'
    df = pandasql.sqldf(q.lower(), locals())

    # mean of the total number of entries for all units
    q = 'SELECT ' + col_name + ', AVG(totentries) AS avgentries FROM df GROUP BY ' + col_name + ';'
    df = pandasql.sqldf(q.lower(), locals())
    return df
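# nentries_mean splices col_name directly into the SQL text. A sketch of the same
# two-step aggregation with a whitelist check on the interpolated name (the allowed
# names here are assumptions, not taken from the original code):
import pandasql

def nentries_mean_safe(df, col_name):
    allowed = {'hour', 'day_week'}
    if col_name not in allowed:
        raise ValueError("unexpected column: %s" % col_name)
    q = ('SELECT {c}, daten, SUM(entriesperhr) AS totentries '
         'FROM df GROUP BY {c}, daten;').format(c=col_name)
    df = pandasql.sqldf(q.lower(), locals())
    q = ('SELECT {c}, AVG(totentries) AS avgentries '
         'FROM df GROUP BY {c};').format(c=col_name)
    return pandasql.sqldf(q.lower(), locals())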
def num_rainy_days(filename):
    '''
    Function holding various SQL statements regarding this weather data:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    Only the rainy-day count (rain_count) is executed and returned.
    '''
    weather_data = pandas.read_csv(filename)

    rain_count = """
    SELECT count(rain)
    FROM weather_data
    WHERE rain = 1;
    """

    maxTemp_fog = """
    SELECT fog, max(maxtempi)
    FROM weather_data
    WHERE fog = 0
    UNION
    SELECT fog, max(maxtempi)
    FROM weather_data
    WHERE fog = 1;
    """

    meanTemp_weekend = """
    select avg(cast (meantempi as integer))
    from weather_data
    where cast (strftime('%w', date) as integer) in (0, 6);
    """

    avgMinTemp_rain = """
    select avg(mintempi)
    from weather_data
    where rain = 1 and mintempi > 55;
    """

    # Execute the rainy-day count against the pandas frame
    rainy_count = pandasql.sqldf(rain_count.lower(), locals())
    return rainy_count
def entries_day_line(turnstile_weather):
    # Add Time Period
    turnstile_weather = time_period(turnstile_weather)

    q = """
    select day_week as Day_Week, sum(ENTRIESn_hourly) as ENTRIESn_hourly
    from turnstile_weather
    group by day_week
    """
    # Execute your SQL command against the pandas frame
    entries_day = pandasql.sqldf(q, locals())

    plt.figure()
    plt.title('Turnstile Entries by Day (5/1/2011 - 5/31/2011)')
    plt.ylabel('Turnstile Entries (in millions)')
    plt.xlabel('Day of Week')
    y = entries_day['ENTRIESn_hourly'] / 1000000
    x = entries_day['Day_Week']
    labels = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
    plt.xticks(x, labels)
    plt.xlim(-1, 7)
    plt.ylim(0, 25)
    plt.plot(x, y, marker='.', linestyle='--')
    #print entries_day
    return plt
def aggregate_query(filename): aadhaar_data = pandas.read_csv(filename) aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True) # Write a query that will select from the aadhaar_data table how # many men and how many women over the age of 50 have had # aadhaar generated for them in each district # # The possible columns to select from aadhaar data are: # 1) Registrar # 2) Enrolment Agency # 3) State # 4) District # 5) Sub District # 6) Pin Code # 7) Gender # 8) Age # 9) Aadhaar generated # 10) Enrolment Rejected # 11) Residents providing email, # 12) Residents providing mobile number # # You can download a copy of the aadhaar data that we are passing # into this exercise below: # https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv q = """ select gender, district, sum(aadhaar_generated) from aadhaar_data where age > 50 group by gender, district """ aadhaar_solution = pandasql.sqldf(q.lower(), locals()) return aadhaar_solution
def avg_min_temperature(filename):
    weather_data = pandas.read_csv(filename)

    q = """
    SELECT avg(mintempi)
    FROM weather_data
    WHERE mintempi > 55 AND rain = 1;
    """

    avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals())
    return avg_min_temp_rainy
def select_first_50(filename): # Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns # by replacing spaces with underscores and setting all characters to lowercase, so the # column names more closely resemble columns names one might find in a table. aadhaar_data = pandas.read_csv(filename) aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True) # Select out the first 50 values for "registrar" and "enrolment_agency" # in the aadhaar_data table using SQL syntax. # # Note that "enrolment_agency" is spelled with one l. Also, the order # of the select does matter. Make sure you select registrar then enrolment agency # in your query. q = """ -- YOUR QUERY HERE SELECT registrar, enrolment_agency FROM aadhaar_data LIMIT 50; """ #Execute your SQL command against the pandas frame aadhaar_solution = pandasql.sqldf(q.lower(), locals()) return aadhaar_solution
def get_turnstile_samples(filename):
    df = pandas.read_csv(filename)
    # Aggregate rain with max() so every selected column is either grouped or aggregated
    query = """SELECT DATEn, sum(ENTRIESn_hourly) as entries, max(rain) as rain
               FROM df
               GROUP BY DATEn"""
    samples = pandasql.sqldf(query.lower(), locals())
    return samples
def max_temp_aggregate_by_fog(filename):
    weather_data = pandas.read_csv(filename)

    q = """
    SELECT fog, MAX(maxtempi)
    FROM weather_data
    GROUP BY fog;
    """

    foggy_days = pandasql.sqldf(q.lower(), locals())
    return foggy_days
def avg_min_temperature(filename): ''' This function should run a SQL query on a dataframe of weather data. More specifically you want to find the average minimum temperature (mintempi column of the weather dataframe) on rainy days where the minimum temperature is greater than 55 degrees. You might also find that interpreting numbers as integers or floats may not work initially. In order to get around this issue, it may be useful to cast these numbers as integers. This can be done by writing cast(column as integer). So for example, if we wanted to cast the maxtempi column as an integer, we would actually write something like where cast(maxtempi as integer) = 76, as opposed to simply where maxtempi = 76. You can see the weather data that we are passing in below: https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv ''' weather_data = pandas.read_csv(filename) q = """ SELECT AVG(mintempi) FROM weather_data WHERE rain = 1 AND cast(mintempi as integer) > 55 """ #Execute your SQL command against the pandas frame avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals()) return avg_min_temp_rainy
def num_rainy_days(filename):
    '''
    This function should run a SQL query on a dataframe of weather data.
    The SQL query should return one column and one row - a count of the
    number of days in the dataframe where the rain column is equal to 1
    (i.e., the number of days it rained). The dataframe will be titled
    'weather_data'. You'll need to provide the SQL query. You might find
    SQL's count function useful for this exercise. You can read more about
    it here:
    https://dev.mysql.com/doc/refman/5.1/en/counting-rows.html

    You might also find that interpreting numbers as integers or floats may
    not work initially. In order to get around this issue, it may be useful
    to cast these numbers as integers. This can be done by writing
    cast(column as integer). So for example, if we wanted to cast the
    maxtempi column as an integer, we would actually write something like
    where cast(maxtempi as integer) = 76, as opposed to simply
    where maxtempi = 76.

    You can see the weather data that we are passing in below:
    https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv
    '''
    # Read from the filename argument rather than a hard-coded local path
    weather_data = pandas.read_csv(filename)

    q = """
    SELECT Count(date)
    FROM weather_data
    WHERE rain = 1
    """

    # Execute your SQL command against the pandas frame
    rainy_days = pandasql.sqldf(q.lower(), locals())
    return rainy_days
def filter_by_regular(filename): ''' This function should read the csv file located at filename into a pandas dataframe, and filter the dataframe to only rows where the 'DESCn' column has the value 'REGULAR'. For example, if the pandas dataframe is as follows: ,C/A,UNIT,SCP,DATEn,TIMEn,DESCn,ENTRIESn,EXITSn 0,A002,R051,02-00-00,05-01-11,00:00:00,REGULAR,3144312,1088151 1,A002,R051,02-00-00,05-01-11,04:00:00,DOOR,3144335,1088159 2,A002,R051,02-00-00,05-01-11,08:00:00,REGULAR,3144353,1088177 3,A002,R051,02-00-00,05-01-11,12:00:00,DOOR,3144424,1088231 The dataframe will look like below after filtering to only rows where DESCn column has the value 'REGULAR': 0,A002,R051,02-00-00,05-01-11,00:00:00,REGULAR,3144312,1088151 2,A002,R051,02-00-00,05-01-11,08:00:00,REGULAR,3144353,1088177 ''' turnstile_data = pandas.read_csv(filename) q = """ select * from turnstile_data where DESCn like 'REGULAR' """ #Execute your SQL command against the pandas frame turnstile_data_results = pandasql.sqldf(q.lower(), locals()) #print turnstile_data_results return turnstile_data_results
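# The same DESCn == 'REGULAR' filter can be done with a plain boolean mask;
# a minimal sketch, assuming the same csv layout as filter_by_regular above:
import pandas

def filter_by_regular_pandas(filename):
    turnstile_data = pandas.read_csv(filename)
    return turnstile_data[turnstile_data['DESCn'] == 'REGULAR']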
def num_rainy_days(filename):
    weather_data = pandas.read_csv(filename, na_values='')

    # Alias the count as rainy_days rather than reusing the table name
    q = '''
    SELECT count(rain) AS rainy_days
    FROM weather_data
    WHERE rain = 1;
    '''

    rainy_days = pandasql.sqldf(q.lower(), locals())
    return rainy_days
def max_temp_aggregate_by_fog(filename):
    """
    This function should run a SQL query on a dataframe of weather data.
    The SQL query should return two columns and two rows - whether it was
    foggy or not (0 or 1) and the max maxtempi for that fog value (i.e.,
    the maximum max temperature for both foggy and non-foggy days). The
    dataframe will be titled 'weather_data'. You'll need to provide the
    SQL query.

    You might also find that interpreting numbers as integers or floats may
    not work initially. In order to get around this issue, it may be useful
    to cast these numbers as integers. This can be done by writing
    cast(column as integer). So for example, if we wanted to cast the
    maxtempi column as an integer, we would actually write something like
    where cast(maxtempi as integer) = 76, as opposed to simply
    where maxtempi = 76.

    You can see the weather data that we are passing in below:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    """
    weather_data = pandas.read_csv(filename)

    q = ("SELECT fog, MAX(cast(maxtempi as integer)) FROM weather_data WHERE cast(fog as integer) = 1 "
         "UNION "
         "SELECT fog, MAX(cast(maxtempi as integer)) FROM weather_data WHERE cast(fog as integer) = 0")

    # Execute your SQL command against the pandas frame
    foggy_days = pandasql.sqldf(q.lower(), locals())
    return foggy_days
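# The UNION of two filtered SELECTs above returns the same two rows as a single
# GROUP BY on fog; a sketch of that alternative, under the same assumptions:
import pandas
import pandasql

def max_temp_by_fog_groupby(filename):
    weather_data = pandas.read_csv(filename)
    q = """
    SELECT fog, MAX(cast(maxtempi as integer)) AS max_maxtempi
    FROM weather_data
    GROUP BY fog;
    """
    return pandasql.sqldf(q.lower(), locals())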
def num_rainy_days(filename): ''' This function should run a SQL query on a dataframe of weather data. The SQL query should return one column and one row - a count of the number of days in the dataframe where the rain column is equal to 1 (i.e., the number of days it rained). The dataframe will be titled 'weather_data'. You'll need to provide the SQL query. You might find SQL's count function useful for this exercise. You can read more about it here: https://dev.mysql.com/doc/refman/5.1/en/counting-rows.html You might also find that interpreting numbers as integers or floats may not work initially. In order to get around this issue, it may be useful to cast these numbers as integers. This can be done by writing cast(column as integer). So for example, if we wanted to cast the maxtempi column as an integer, we would actually write something like where cast(maxtempi as integer) = 76, as opposed to simply where maxtempi = 76. You can see the weather data that we are passing in below: https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv ''' weather_data = pandas.read_csv(filename) q = """ select count(*) from weather_data where rain = 1 """ #Execute your SQL command against the pandas frame rainy_days = pandasql.sqldf(q.lower(), locals()) return rainy_days
def avg_weekend_temperature(filename):
    '''
    The SQL query should return one column and one row - the average
    meantempi on days that are a Saturday or Sunday (i.e., the average mean
    temperature on weekends). The dataframe will be titled 'weather_data'
    and you can access the date in the dataframe via the 'date' column.

    Also, you can convert dates to days of the week via the 'strftime'
    keyword in SQL. For example, cast (strftime('%w', date) as integer)
    will return 0 if the date is a Sunday or 6 if the date is a Saturday.
    '''
    weather_data = pandas.read_csv(filename)
    #print weather_data.columns.values

    q = """
    SELECT avg(cast(meantempi as integer))
    FROM weather_data
    WHERE cast(strftime('%w', date) as integer) = 0
       OR cast(strftime('%w', date) as integer) = 6
    """

    # Execute your SQL command against the pandas frame
    mean_temp_weekends = pandasql.sqldf(q.lower(), locals())
    return mean_temp_weekends
def aggregate_query(filename): # Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns # by replacing spaces with underscores and setting all characters to lowercase, so the # column names more closely resemble columns names one might find in a table. aadhaar_data = pandas.read_csv(filename) aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True) # Write a query that will select from the aadhaar_data table how many men and how # many women over the age of 50 have had aadhaar generated for them in each district. # aadhaar_generated is a column in the Aadhaar Data that denotes the number who have had # aadhaar generated in each row of the table. # q = """ SELECT gender, district, sum(aadhaar_generated) FROM aadhaar_data WHERE age > 50 GROUP BY gender, district """ # Execute your SQL command against the pandas frame aadhaar_solution = pandasql.sqldf(q.lower(), locals()) return aadhaar_solution
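# For comparison, a sketch of the same gender/district aggregation done with a
# pandas groupby instead of pandasql (same renamed columns as aggregate_query above):
import pandas

def aggregate_query_pandas(filename):
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns=lambda x: x.replace(' ', '_').lower(), inplace=True)
    over_50 = aadhaar_data[aadhaar_data['age'] > 50]
    return (over_50.groupby(['gender', 'district'])['aadhaar_generated']
            .sum()
            .reset_index())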
def select_first_50(filename): # Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns # by replacing spaces with underscores and setting all characters to lowercase, so the # column names more closely resemble columns names one might find in a table. aadhaar_data = pandas.read_csv(filename) aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True) # Select out the first 50 values for "registrar" and "enrolment_agency" # in the aadhaar_data table using SQL syntax. # # Note that "enrolment_agency" is spelled with one l. Also, the order # of the select does matter. Make sure you select registrar then enrolment agency # in your query. # # You can download a copy of the aadhaar data that we are passing # into this exercise below: # https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv q = """ select "registrar", "enrolment_agency" from aadhaar_data limit 50; """ #Execute your SQL command against the pandas frame aadhaar_solution = pandasql.sqldf(q.lower(), locals()) return aadhaar_solution
def avg_weekend_temperature(filename): ''' This function should run a SQL query on a dataframe of weather data. The SQL query should return one column and one row - the average meantempi on days that are a Saturday or Sunday (i.e., the the average mean temperature on weekends). The dataframe will be titled 'weather_data' and you can access the date in the dataframe via the 'date' column. You'll need to provide the SQL query. You might also find that interpreting numbers as integers or floats may not work initially. In order to get around this issue, it may be useful to cast these numbers as integers. This can be done by writing cast(column as integer). So for example, if we wanted to cast the maxtempi column as an integer, we would actually write something like where cast(maxtempi as integer) = 76, as opposed to simply where maxtempi = 76. Also, you can convert dates to days of the week via the 'strftime' keyword in SQL. For example, cast (strftime('%w', date) as integer) will return 0 if the date is a Sunday or 6 if the date is a Saturday. You can see the weather data that we are passing in below: https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv ''' weather_data = pandas.read_csv(filename) q = """ select avg(cast(meantempi as integer)) from weather_data where cast (strftime('%w', date) as integer) = 0 or cast(strftime('%w', date) as integer) = 6 """ #Execute your SQL command against the pandas frame mean_temp_weekends = pandasql.sqldf(q.lower(), locals()) return mean_temp_weekends
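# The strftime('%w', date) trick above maps Sunday to 0 and Saturday to 6. A hedged
# pandas equivalent using dt.dayofweek (Monday=0, ..., Sunday=6), assuming the
# 'date' column parses cleanly with pandas.to_datetime:
import pandas

def avg_weekend_temperature_pandas(filename):
    weather_data = pandas.read_csv(filename)
    dow = pandas.to_datetime(weather_data['date']).dt.dayofweek
    # dayofweek >= 5 selects Saturday and Sunday
    return weather_data.loc[dow >= 5, 'meantempi'].astype(int).mean()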
def num_rainy_days(filename): ''' This function should run a SQL query on a dataframe of weather data. The SQL query should return one column and one row - a count of the number of days in the dataframe where the rain column is equal to 1 (i.e., the number of days it rained). https://dev.mysql.com/doc/refman/5.1/en/counting-rows.html You might also find that interpreting numbers as integers or floats may not work initially. In order to get around this issue, it may be useful to cast these numbers as integers. This can be done by writing cast(column as integer). So for example, if we wanted to cast the maxtempi column as an integer, we would actually write something like where cast(maxtempi as integer) = 76, as opposed to simply where maxtempi = 76. ''' weather_data = pandas.read_csv(filename) #print weather_data.head() #print weather_data.columns.values #print numpy.sum(weather_data['rain'] > 0) q = """ SELECT COUNT(*) FROM weather_data WHERE rain == 1 """ #Execute your SQL command against the pandas frame rainy_days = pandasql.sqldf(q.lower(), locals()) return rainy_days
def max_temp_aggregate_by_fog(filename): ''' This function should run a SQL query on a dataframe of weather data. The SQL query should return two columns and two rows - whether it was foggy or not (0 or 1) and the max maxtempi for that fog value (i.e., the maximum max temperature for both foggy and non-foggy days). The dataframe will be titled 'weather_data'. You'll need to provide the SQL query. You might also find that interpreting numbers as integers or floats may not work initially. In order to get around this issue, it may be useful to cast these numbers as integers. This can be done by writing cast(column as integer). So for example, if we wanted to cast the maxtempi column as an integer, we would actually write something like where cast(maxtempi as integer) = 76, as opposed to simply where maxtempi = 76. You can see the weather data that we are passing in below: https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv ''' weather_data = pandas.read_csv(filename) q = """ SELECT fog, max(maxtempi) FROM weather_data GROUP BY fog; """ #Execute your SQL command against the pandas frame foggy_days = pandasql.sqldf(q.lower(), locals()) return foggy_days
def test_select(self):
    df = pd.DataFrame({
        "letter_pos": [i for i in range(len(string.ascii_letters))],
        "l2": list(string.ascii_letters)
    })
    result = sqldf("select * from df LIMIT 10;", locals())
    self.assertEqual(len(result), 10)
def entries_histogram(df): # Sum entries by date and UNIT global daily_entries daily_entries = (df[['DATEn','UNIT','ENTRIESn_hourly']]. groupby(['DATEn','UNIT']).sum()) daily_entries = daily_entries.reset_index() # Group rain by date global daily_rain daily_rain = df[['DATEn','rain']].groupby('DATEn').mean() daily_rain = daily_rain.reset_index() daily_rain.loc[:,'rain'] = daily_rain['rain'].apply(lambda x: reduce(x)) # Join daily_entries and daily_rain tables on date from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) q = ('''SELECT e.DATEn, e.UNIT, e.ENTRIESn_hourly, p.rain FROM daily_entries e JOIN daily_rain p ON e.DATEn = p.DATEn;''') daily_entries = pysqldf(q) # Divide daily_entries into rain and no-rain tables daily_entries.loc[:, 'entries_log'] = (daily_entries['ENTRIESn_hourly']. apply(lambda x: take_log(x))) no_rain = daily_entries[daily_entries.rain==0] rain = daily_entries[daily_entries.rain==1] x = [no_rain['entries_log'], rain['entries_log']] # plot histogram plt.hist(x, range = (0, 16), bins = 23, color=['k','m'], label=["no rain","rain"]) plt.xlabel("log of ENTRIESn_hourly summed by date and remote unit") plt.ylabel("Frequency") legend = plt.legend() return plt
from sklearn import decomposition, preprocessing, cluster, metrics
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
import six
from math import *

# ///// For nicer output than the plain "print" function //////
from IPython.display import display, Markdown, HTML, display_html

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace('table', 'table style="display:inline"'), raw=True)

# ///// To run SQL sanity-check queries against DataFrames /////
from pandasql import sqldf
execsql = lambda q: sqldf(q, globals())
# USAGE EXAMPLE
# ----------------------
# req1 = ''' Select zone1, zone2 From DataFrame Where zone3=xx and zone4='xx' limit 3;'''
# df1 = execsql(req1)
# df1

import time

# Limit the number of rows displayed for DataFrames
pd.options.display.max_rows = 20
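# Quick check that the execsql helper behaves as the usage example describes
# (the DataFrame below is made up for illustration; pd is assumed to be imported
# earlier in the notebook):
demo = pd.DataFrame({'zone1': [1, 2, 3], 'zone2': ['a', 'b', 'c']})
print(execsql("SELECT zone1, zone2 FROM demo WHERE zone1 >= 2;"))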
def main(): #yelp_integrated_frame = integrate_restaurants_business() yelp_integrated_frame = fetch_biz_data() inspection_data = fetch_inspection_data() inspection_data = inspection_data[inspection_data['Results'].str.lower().str.contains('fail')] yelp_integrated_frame['DATE_ISSUED'] = pd.to_datetime(yelp_integrated_frame['DATE_ISSUED'], format='%m/%d/%Y') yelp_integrated_frame['LICENSE_STATUS_CHANGE_DATE'] = pd.to_datetime(yelp_integrated_frame['LICENSE_STATUS_CHANGE_DATE'], format='%m/%d/%Y') inspection_data['Inspection_Date'] = pd.to_datetime(inspection_data['Inspection_Date'], format='%m/%d/%Y') inspection_data_test= inspection_data.groupby(['DOING_BUSINESS_AS_NAME', 'ADDRESS'])['Inspection_Date'].max().reset_index() yelp_int_test = yelp_integrated_frame.groupby(['DOING_BUSINESS_AS_NAME', 'ADDRESS'])['DATE_ISSUED'].max().reset_index() integ = pd.merge(inspection_data_test,yelp_int_test, on=['DOING_BUSINESS_AS_NAME']) #integ = integ.sort_values('DOING_BUSINESS_AS_NAME') #pysqldf = lambda q: sqldf(q, globals()) new_df = pd.DataFrame() for dummy,index in integ.iterrows(): add_x = index['ADDRESS_x'] add_y = index['ADDRESS_y'] measure = similar(add_x, add_y) if measure >= 0.8 and index['Inspection_Date'].year <= 2014: index["diff"] = (index['Inspection_Date'] - index['DATE_ISSUED']).days new_df = new_df.append(index) new_df = new_df[new_df['diff'] > 750] new_df = new_df[['DOING_BUSINESS_AS_NAME','ADDRESS_x','Inspection_Date', 'diff']] new_df = new_df.groupby(['DOING_BUSINESS_AS_NAME','ADDRESS_x'])['Inspection_Date','diff'].max().reset_index() q = """SELECT * FROM integ WHERE integ.ADDRESS_x LIKE '%' || integ.ADDRESS_y || '%'""" dff = psql.sqldf(q, locals()) first_issued_lic_data = yelp_integrated_frame.groupby("DOING_BUSINESS_AS_NAME", as_index=False)["DATE_ISSUED"].max() yelp_integrated_frame = pd.merge(yelp_integrated_frame,first_issued_lic_data, on=['DOING_BUSINESS_AS_NAME','DOING_BUSINESS_AS_NAME', 'DATE_ISSUED','DATE_ISSUED',]) yelp_integrated_frame = yelp_integrated_frame.drop_duplicates() yelp_integrated_frame = yelp_integrated_frame[yelp_integrated_frame['LICENSE_STATUS'].str.lower().str.contains('aac|rev')] merged = pd.merge(inspection_data,yelp_integrated_frame, on=['LICENSE_ID']) merged['diff'] = 0 for dummy,index in merged.iterrows(): merged.at[dummy,"diff"] = (index['LICENSE_STATUS_CHANGE_DATE'] - index['DATE_ISSUED']).days merged = merged[['DOING_BUSINESS_AS_NAME_x','ADDRESS_x','Inspection_Date','diff']] merged = merged.groupby(['DOING_BUSINESS_AS_NAME_x','ADDRESS_x'])['Inspection_Date','diff'].max().reset_index() merged = merged.rename(columns={'DOING_BUSINESS_AS_NAME_x': 'DOING_BUSINESS_AS_NAME'}) final_merge = pd.concat([new_df, merged], ignore_index=True) final_merge = final_merge.groupby(['DOING_BUSINESS_AS_NAME','ADDRESS_x'])['Inspection_Date','diff'].max().reset_index() final_merge['diff'] = final_merge['diff'].apply(lambda x: float(x)/float(365)) final_merge = final_merge.rename(columns={'DOING_BUSINESS_AS_NAME': 'Restaurant Name', 'Inspection_Date':'Failed inspection on','ADDRESS':'Address','diff':'Alive for x years'}) final_merge.to_csv('biz_viability_out.csv', encoding='utf-8', index=False) print(yelp_integrated_frame) now = datetime.datetime.now() cur_year = now.year
'SEG_NO':"SegNo", 'SEG_LNGTH_FEET':"SegLenFt", "CUR_AADT":"CurAADT"}) SegInfoData.columns SegInfoData.head() # Join data based on the begin and end segments #************************************************************************************************************ sqlcode = ''' select Data1.ProjID, Data1.CountyCode, Data1.SR, Data1.BegSeg, Data1.BegOff, SegInfoData.SegNo, Data1.EndSeg from Data1 left join SegInfoData on SegInfoData.CountyCode = Data1.CountyCode and SegInfoData.SR = Data1.SR and SegInfoData.SegNo = Data1.BegSeg ''' TestDf = ps.sqldf(sqlcode,locals()) TestDf.isna().sum() TestDf.groupby(['ProjID','CountyCode','SR','BegSeg','BegOff'])['BegSeg'].first().shape sqlcode = ''' select Data1.ProjID, Data1.CountyCode, Data1.SR, Data1.BegSeg, SegInfoData.SegNo, Data1.EndSeg, SegInfoData.SegLenFt, Data1.BegOff, Data1.EndOff, SegInfoData.CurAADT, SegInfoData.X_VALUE_BGN, SegInfoData.Y_VALUE_BGN, SegInfoData.X_VALUE_END, SegInfoData.Y_VALUE_END from Data1 left join SegInfoData on SegInfoData.CountyCode = Data1.CountyCode and SegInfoData.SR = Data1.SR and SegInfoData.SegNo between Data1.BegSeg and Data1.EndSeg ''' NewDf = ps.sqldf(sqlcode,locals()) NewDf.isna().sum() NewDf.groupby(['ProjID','CountyCode','SR','BegSeg','BegOff'])['BegSeg'].first().shape
def corrupt(self, X): """ Note that here X is a pandas DataFrame """ # create new corrupted data Y = X.copy() # cast category to object Y[self.categories_dict.keys()] = Y[self.categories_dict.keys()].apply( lambda x: x.astype('object')) # get means and standard deviations # (will be used as noise for the numericas, does not distort statistics) means_col = X.mean() stds_col = X.std() # add auxiliary index X['indexcol'] = X.index # break the cCFDs in the list for ccfd in self.ccfds: ## Get Rows which hold the cCFD constraint cfd_cond_str = to_str_CFD_SQL(ccfd.LHS, ccfd.RHS) sql_query = "SELECT {} FROM {} WHERE {};".format( "indexcol", "X", cfd_cond_str) df_res = sqldf(sql_query, locals()) # Get categories and respective probabilities in the dataset, # if ccfd.RHS[0] feature is categorical if X[ccfd.RHS[0]].dtype.name == 'category': cats = [ t for t in self.categories_dict[ccfd.RHS[0]] if t != ccfd.RHS[1] ] cats_probs = X[ccfd.RHS[0]].value_counts()[cats].values cats_probs = cats_probs / float(cats_probs.sum()) ## Insert Right Hand Side Noise (to violate the constraint) for row_idx in df_res['indexcol']: if numpy.random.rand() <= self.p: # is categorical if X[ccfd.RHS[0]].dtype.name == 'category': # choose other categories according to their proportion in the dataset idx_cat = numpy.random.choice(len(cats), 1, False, cats_probs)[0] Y.set_value(row_idx, ccfd.RHS[0], cats[idx_cat]) # is integer elif X[ccfd.RHS[0]].dtype.name in [ 'int16', 'int32', 'int64' ]: # noise the cell using the mean of column (with a fraction of the standard deviation) Y.set_value( row_idx, ccfd.RHS[0], int(means_col[ccfd.RHS[0]] + 0.10 * stds_col[ccfd.RHS[0]])) # is float elif X[ccfd.RHS[0]].dtype.name in [ 'float16', 'float32', 'float64' ]: # noise the cell using the mean of column (with a fraction of the standard deviation) Y.set_value( row_idx, ccfd.RHS[0], float(means_col[ccfd.RHS[0]] + 0.10 * stds_col[ccfd.RHS[0]])) # Add Typo if none of above else: # noise the cell using standard typo (e.g. unique/rare) Y.set_value(row_idx, ccfd.RHS[0], "*" + ccfd.RHS[1] + "*") #Testing #for ccfd in self.ccfds: # # Get Rows which hold the cCFD constraint # cfd_cond_str = to_str_CFD_SQL(ccfd.LHS, ccfd.RHS) # sql_query = "SELECT {} FROM {} WHERE {};".format("count(*)", "Y", cfd_cond_str) # df_res = sqldf(sql_query, locals()) # print df_res # drop auxiliary index X.drop('indexcol', axis=1, inplace=True) return Y
def link_to_secid(df): """ df should contain columns date and permno to get the match returns the same data frame with added column for OM secid """ # Manually reading optionmetrics-crsp linking suite since there is # no dataset to download this from WRDS oclink = pd.read_csv( "estimated_data/crsp_data/optionmetrics_crsp_link.csv") # Getting the best link for each month end oclink = oclink[oclink.score < 6] oclink["sdate"] = [str(int(x)) for x in oclink["sdate"]] oclink["sdate"] = pd.to_datetime(oclink["sdate"], format="%Y%m%d") oclink["edate"] = [str(int(x)) for x in oclink["edate"]] oclink["edate"] = pd.to_datetime(oclink["edate"], format="%Y%m%d") q1 = """ select d.*, s1.secid as secid_1, s2.secid as secid_2, s3.secid as secid_3, s4.secid as secid_4, s5.secid as secid_5 from df as d left join (select secid, permno, sdate, edate from oclink where score = 1) as s1 on d.permno = s1.permno and d.date >= s1.sdate and d.date <= s1.edate left join (select secid, permno, sdate, edate from oclink where score = 2) as s2 on d.permno = s2.permno and d.date >= s2.sdate and d.date <= s2.edate left join (select secid, permno, sdate, edate from oclink where score = 3) as s3 on d.permno = s3.permno and d.date >= s3.sdate and d.date <= s3.edate left join (select secid, permno, sdate, edate from oclink where score = 4) as s4 on d.permno = s4.permno and d.date >= s4.sdate and d.date <= s4.edate left join (select secid, permno, sdate, edate from oclink where score = 5) as s5 on d.permno = s5.permno and d.date >= s5.sdate and d.date <= s5.edate """ tmp = sqldf(q1, locals()) # Filtering and providing the best match: q2 = """ select *, COALESCE(secid_1, secid_2, secid_3, secid_4, secid_5) as secid from tmp """ df = sqldf(q2, locals()) df = df.drop( columns=["secid_1", "secid_2", "secid_3", "secid_4", "secid_5"]) # Converting date columns to date format: df["date"] = pd.to_datetime(df["date"]) return df
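# A minimal two-level sketch of the COALESCE pattern used in link_to_secid above:
# join the score=1 and score=2 candidates separately, then keep the first non-null
# secid. Column names follow the query above; the helper itself is illustrative.
from pandasql import sqldf

def best_match_sketch(df, oclink):
    q = """
    select d.*, COALESCE(s1.secid, s2.secid) as secid
    from df as d
    left join (select secid, permno, sdate, edate from oclink where score = 1) as s1
        on d.permno = s1.permno and d.date between s1.sdate and s1.edate
    left join (select secid, permno, sdate, edate from oclink where score = 2) as s2
        on d.permno = s2.permno and d.date between s2.sdate and s2.edate
    """
    return sqldf(q, locals())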
def update_trace(self): self.load_mapped_data() self.filtered_data = self.data self.plot_data = self.filtered_data plot_samplerate = self.samplerate if self.cutoff_entry.get() != '' and self.order_entry != '': self.filter_data() self.plot_data = self.filtered_data if self.downsample_entry.get() != '': self.downsample_data() self.plot_data = self.downsampled_data plot_samplerate = float(self.downsample_entry.get()) self.trace_fig.clf() a = self.trace_fig.add_subplot(111) if self.events_flag: db = self.ratefile start_time = self.start_time end_time = self.end_time good_start = np.squeeze( sqldf( 'SELECT start_time_s from db WHERE start_time_s >= {0} AND start_time_s < {1} AND type IN (0,1)' .format(start_time, end_time), locals()).values) * 1e6 bad_start = np.squeeze( sqldf( 'SELECT start_time_s from db WHERE start_time_s >= {0} AND start_time_s < {1} AND type>1' .format(start_time, end_time), locals()).values) * 1e6 good_end = np.squeeze( sqldf( 'SELECT end_time_s from db WHERE end_time_s >= {0} AND end_time_s < {1} AND type IN (0,1)' .format(start_time, end_time), locals()).values) * 1e6 bad_end = np.squeeze( sqldf( 'SELECT end_time_s from db WHERE end_time_s >= {0} AND end_time_s < {1} AND type>1' .format(start_time, end_time), locals()).values) * 1e6 for gs, ge in zip(np.atleast_1d(good_start), np.atleast_1d(good_end)): a.axvspan(gs, ge, color='g', alpha=0.3) for bs, be in zip(np.atleast_1d(bad_start), np.atleast_1d(bad_end)): a.axvspan(bs, be, color='r', alpha=0.3) time = np.linspace(1.0 / plot_samplerate, len(self.plot_data) / plot_samplerate, len(self.plot_data)) + self.start_time a.set_xlabel(r'Time ($\mu s$)') a.set_ylabel('Current (pA)') self.trace_fig.subplots_adjust(bottom=0.14, left=0.21) a.plot(time * 1e6, self.plot_data, '.', markersize=1) if self.baseline_flag: if self.config_cutoff != int( self.cutoff_entry.get()) or self.config_order != int( self.order_entry.get()): self.wildcard.set( 'Filter settings in config file do not match plotting filter settings, overlay will be inaccurate' ) db = self.baseline_file start_time = self.start_time end_time = self.end_time times = np.squeeze(sqldf('SELECT time_s from db', locals()).values) times = np.sort(times) start_block = times[0] for time in times: if time <= start_time and time >= start_block: start_block = time baseline_db = sqldf( 'SELECT * from db WHERE time_s >= {0} and time_s < {1}'.format( start_block, end_time), locals()) times = baseline_db['time_s'].values means = baseline_db['baseline_pA'].values stdevs = baseline_db['stdev_pA'].values numblocks = len(means) for i in range(numblocks): if i == 0: xmin = start_time else: xmin = times[i] if i + 1 == numblocks: xmax = end_time else: xmax = times[i + 1] sign = np.sign(means[i]) a.plot( (xmin * 1e6, xmax * 1e6), (means[i] - sign * (self.threshold - self.hysteresis) * stdevs[i], means[i] - sign * (self.threshold - self.hysteresis) * stdevs[i]), '--', color='y') a.plot((xmin * 1e6, xmax * 1e6), (means[i] - sign * self.threshold * stdevs[i], means[i] - sign * self.threshold * stdevs[i]), '--', color='y') a.plot((xmin * 1e6, xmax * 1e6), (means[i], means[i]), '--', color='black') self.trace_canvas.show()
x_normd = Norm_dist(x5) Out_Reports_norm = x_normd[0] x6 = x_normd[1] PAgr = pd.merge(Profile_data, Out_Reports_norm, how='left', left_on=['Item_Name'], right_on=['Item_Name']) ###########################################Pagr table updation###################################### x6['updated'] = 0 Imputation = sqldf( """SELECT Item_Name,count(*) as Miss_days FROM x6 where Original is NULL group by Item_Name""", locals()) PAgr = pd.merge(PAgr, Imputation, how='left', left_on=['Item_Name'], right_on=['Item_Name']) # if PAgr.isnull().values.any( ): # if any value having na in Pagr we are conver that value to 0 I = PAgr['Miss_days'].index[PAgr['Miss_days'].apply(np.isnan)] PAgr['Miss_days'][I.values] = 0 PAgr['Act_AfterImput'] = PAgr["Total_" + forecastMessure] + PAgr['Miss_days'].astype(int) PAgr['PcImputation'] = round(PAgr.Miss_days / PAgr.Act_AfterImput, 2)
import pandasql as sql

df = pd.read_csv('https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/aadhaar_data.csv')
df.rename(columns=lambda x: x.replace(' ', '_').lower(), inplace=True)
#### The command above renames the columns, e.g. "enrolment agency" becomes "enrolment_agency"

query = """select state, sum(Aadhaar_generated)
from df
group by state
limit 50;
"""

query1 = """select sum(case when gender='M' then 1 else 0 end) as male,
sum(case when gender='F' then 1 else 0 end) as fem
from df
where age > 50;
"""

query2 = """select gender, sum(Aadhaar_generated)
from df
where age > 50
group by gender;
"""

print(sql.sqldf(query, locals()))
print(sql.sqldf(query1, locals()))
print(sql.sqldf(query2, locals()))
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns) tr['Cart'] = pd.cut( tr.Carats, bins=[0, 0.5, 1, 1.5, 2, 3, 4, 5, 15], labels=['0_0.5', '0.5_1', '1-1.5', '1.5-2', '2-3', '3-4', '4-5', '5+'], include_lowest=True) final = pd.crosstab(tr.Vendor, tr.Cart, tr.Price, aggfunc="mean") final final2 = pd.crosstab(tr.Cert, tr.Cart, tr.Profit_flg, aggfunc="sum") final2 q = """SELECT Cert , Profit_flg ,avg(Profit) as Avg_Profit ,count(0) as freq FROM tr group by Cert , Profit_flg;""" print(ps.sqldf(q, locals())) q = """SELECT Vendor , Cart ,avg(Price) as Avg_Pr ,count(0) as freq FROM tr group by Vendor, Cart;""" print(ps.sqldf(q, locals())) q = """SELECT Shape ,count(0) as count FROM tr group by Shape;""" print(ps.sqldf(q, locals())) #remove white space tr['Shape1'] = tr['Shape'].str.replace(r'\s+', '') tr['Shape2'] = pd.np.where( tr.Shape1.str.contains('Marquis|Marwuise'), 'Marquise', pd.np.where(tr.Shape1.str.contains('ROUND'), 'Round', tr['Shape1'])) tr['Shape'] = tr['Shape2'] del (tr['Shape1'], tr['Shape2'])
import pandas as pd import numpy as np from numpy import unique from datetime import datetime import pymysql pymysql.install_as_MySQLdb() import MySQLdb import pandas.io.sql as psql import sqlalchemy as sq from sqlalchemy import create_engine from pandas.api.types import is_string_dtype from pandas.api.types import is_numeric_dtype import re from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) from time import gmtime, strftime import datetime db = pymysql.connect('34.214.211.162', 'mobiloansteam', 'team123456', 'mobiloans') cur = db.cursor(pymysql.cursors.DictCursor) UserId = 'mobiloansteam' auto_dialer = pd.read_sql( "SELECT * FROM mobiloans_auto_dialer where date >= '20180306' and date <= '20180405' ", con=db) manual_dialer = pd.read_sql( "SELECT * FROM mobiloans_manual_dialer where date >= '20180306' and date <= '20180405' ", db) sql_payment = pd.read_sql( "select loan_number AS AccountNumber,count(transaction_amount) AS Count_payment,sum(transaction_amount) AS payment FROM mobiloans.mobiloans_payment_file WHERE transaction_type_description='Payment' and transaction_effective_date >= '2018-03-06' and transaction_effective_date <= '2018-04-05' GROUP BY loan_number ",
def print_result():
    # Execute your SQL command against the pandas frame
    rainy_days = pandasql.sqldf(q.lower(), globals())
    print(rainy_days)
# encoding=utf-8
"""Commonly used SQL helpers"""
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, LongType, DoubleType, StructType
import pandasql
import numpy as np
import pandas as pd
import os

schema = [("name", ""), ("age", 0.0)]
data = [("a", 1), ("b", 2)]

pysqldf = lambda sql: pandasql.sqldf(sql, globals())  # SQL query engine

test_df = pd.DataFrame(np.array([[1, 2], [3, 4]]))
test_df.columns = ["a", "b"]
pysqldf("select * from test_df")  # test_df is the variable name of the DataFrame


def _build_field(name_type):
    type_map = {
        str: StructField(name_type[0], StringType(), True),
        int: StructField(name_type[0], IntegerType(), True),
        float: StructField(name_type[0], DoubleType(), True)  # floats map to DoubleType, not LongType
    }
    return type_map.get(type(name_type[1]))


def build_df_by_schema(rdd, schemas, tb_name=None):
    spark = SQL.spark
    df_schema = StructType([_build_field(x) for x in schemas])
    df_rdd = spark.createDataFrame(rdd, df_schema)
    if tb_name:
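# Sketch of what _build_field produces for the sample `schema` above (assuming the
# DoubleType mapping for floats): a StructField whose Spark type is chosen from the
# Python type of the example value.
fields = [_build_field(x) for x in schema]
# -> [StructField("name", StringType(), True), StructField("age", DoubleType(), True)]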
labels=dict_labels, title="Fig 2: Curva de contagios en Punta Arenas") fig_2.update_layout(showlegend=False) data_g3 = data_comunas[(filt_reg) & (filt_not_pa) ] \ .groupby(["fecha", "region", "comuna"])["casos"] \ .sum() \ .reset_index() fig_3 = px.line(data_g3, x="fecha", y="casos", color="comuna", labels=dict_labels, title="Fig 3: Curva de contagios fuera de Punta Arenas") fig_3.update_layout(showlegend=True) p_4 = "Quizás, más ilustrativo sea ver la velocidad de aumento de casos en las comunas de la región:" comunas_unique = ps.sqldf("SELECT DISTINCT comuna FROM data_comunas WHERE region = 'Magallanes'") \ .to_dict(orient = "records") data_g4 = pd.DataFrame() for comuna in data_comunas[filt_reg].comuna.unique(): data_tmp = data_comunas[(filt_reg) & (data_comunas.comuna == comuna)] \ .groupby(["fecha", "region", "comuna"])["casos"] \ .sum() \ .diff(periods = 1) \ .reset_index() data_g4 = data_g4.append(data_tmp) del data_tmp fig_4 = px.line( data_g4, x="fecha", y="casos", color="comuna", labels=dict_labels,
def deaths():
    df = pd.read_csv(deaths_path)
    return sqldf("SELECT * FROM df").to_json()
def cases():
    df = pd.read_csv(cases_path)
    return sqldf("SELECT * FROM df").to_json()
'''
This function should run a SQL query on a dataframe of weather data.
The SQL query should return two columns and two rows - whether it was
foggy or not (0 or 1) and the max maxtempi for that fog value (i.e., the
maximum max temperature for both foggy and non-foggy days). The dataframe
will be titled 'weather_data'. You'll need to provide the SQL query.

You might also find that interpreting numbers as integers or floats may
not work initially. In order to get around this issue, it may be useful
to cast these numbers as integers. This can be done by writing
cast(column as integer). So for example, if we wanted to cast the
maxtempi column as an integer, we would actually write something like
where cast(maxtempi as integer) = 76, as opposed to simply
where maxtempi = 76.

You can see the weather data that we are passing in below:
https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv
'''
import pandas
import pandasql

# Use a raw string so the backslashes in the Windows path are not treated as escapes
filename = r'A:\DataScience\Intro_to_DS\weather-underground.csv'
weather_data = pandas.read_csv(filename)

q = """
SELECT fog, MAX(maxtempi)
FROM weather_data
GROUP BY fog
"""

# Execute your SQL command against the pandas frame
foggy_days = pandasql.sqldf(q.lower(), locals())
print(foggy_days)
def load_data(): # This needs to be migrated, perhaps AWS configuration manager? host = "localhost" dbname = "test" user = "******" password = "******" conn = pymysql.connect(host, user=user, passwd=password, db=dbname) logger = get_logger("PreProcessing") logger.info("Loading Data") # Create variable for pandasql pysqldf = lambda q: sqldf(q, globals()) # start = timeit.default_timer() # cursor = conn.cursor() # cursor.execute('select * from matches'); # rows = cursor.fetchall() # matches = pd.DataFrame( [[ij for ij in i] for i in rows] ) # stop = timeit.default_timer() # # print ("Cursor execute") # print(stop - start) cursor = conn.cursor() start = timeit.default_timer() # #---------------------------------------------- # # Load users table, remove admin type users logger.info("Loading User Data") # Users can be associated to more than one community, if adding communities here users will have duplicated values # when using users later on, drop community column and drop duplicates # users = pd.read_sql('SELECT user_id, first_name, last_name, commute_mode, neighborhood, created_at as registration_date,\ # main_email, \ # CASE WHEN (user_id not in(select driver_id from rides)\ # AND user_id not in \ # (select user_id from passengers) ) THEN "REGISTERED"\ # ELSE "ACTIVE" END AS user_type FROM users \ # WHERE type != "admin" AND validated_email=1 AND main_email NOT IN \ # (SELECT main_email FROM (\ # SELECT main_email , count(user_id) as dup FROM users GROUP BY 1 \ # HAVING dup>1) as a);', con=conn) cursor.execute(""" SELECT a.user_id, a.first_name, a.last_name, a.commute_mode, a.neighborhood, a.created_at as date, a.main_email, c.name as community, CASE WHEN (a.user_id not in(select driver_id from rides) AND a.user_id not in (select user_id from passengers) ) THEN "REGISTERED" ELSE "ACTIVE" END AS user_type FROM users as a LEFT JOIN user_communities as b ON a.user_id = b.user_id LEFT JOIN communities as c ON b.community_id = c.id WHERE type != "admin" AND validated_email=1 AND main_email NOT IN (SELECT main_email FROM ( SELECT main_email , count(user_id) as dup FROM users GROUP BY 1 HAVING dup>1) as a); """) rows = cursor.fetchall() users = pd.DataFrame([[ij for ij in i] for i in rows]) users.columns = [ 'user_id', 'first_name', 'last_name', 'commute_mode', 'neighborhood', 'date', 'main_email', 'community', 'user_type' ] # add new id users["new_id"] = users.index + 1 users['date'] = pd.to_datetime(users['date']) users['reg_date_ym'] = users.date.dt.to_period('M') users.index = users['date'] # #----------------------------------------------- # Look for duplicated users, remove from further queries, most likely these are all admin dup_users1 = pd.read_sql('SELECT user_id FROM users \ WHERE main_email IN \ (SELECT main_email FROM (\ SELECT main_email , count(user_id) as dup FROM users GROUP BY 1 \ HAVING dup>1) as a);', con=conn) dup_users = dup_users1['user_id'].values.tolist() format_strings = ','.join(['%s'] * len(dup_users)) # #------------------------------------------------ # # Create the matches table, join with the rides, passengers and community # logger.info("Loading Matches Data") logger.info("Loading Matches Data") cursor.execute( """ SELECT d.name as community, a.ride_id, a.date, b.hour, a.created_at as publication_date, b.driver_id, c.user_id as passenger_id, coalesce(c.created_at, NULL) as match_date, b.type, b.seats, b.begin_location_gps, b.end_location_gps, b.distance_value, a.updated_at, c.updated_at AS pass_updated_at, YEAR(a.date) as ride_year, MONTH(a.date) as ride_month, 
WEEK(a.date) as ride_week, DAYOFWEEK(a.date) as ride_dow, DAY(a.date) as ride_day, HOUR(b.hour) as ride_hour FROM ride_dates AS a JOIN rides AS b ON a.ride_id = b.ride_id JOIN passengers as c ON a.ride_id = c.ride_id AND a.date = c.date JOIN communities as d ON b.community_id = d.id WHERE a.deleted_at IS NULL AND c.user_id not in (%s) """ % format_strings, tuple(dup_users)) rows = cursor.fetchall() matches = pd.DataFrame([[ij for ij in i] for i in rows]) # Add column names matches.columns = [ 'community', 'ride_id', 'date', 'hour', 'publication_date', 'driver_id', 'passenger_id', 'match_date', 'type', 'seats', 'begin_location_gps', 'end_location_gps', 'distance_value', 'updated_at', 'pass_updated_at', 'ride_year', 'ride_month', 'ride_week', 'ride_dow', 'ride_day', 'ride_hour' ] #Standarise date types matches['date'] = pd.to_datetime(matches['date']) matches['publication_date'] = pd.to_datetime(matches['publication_date']) matches['match_date'] = pd.to_datetime(matches['match_date']) matches.index = matches['date'] #df.resample('M').agg(dict(score='count')) #matches['ride'] = matches.date.dt.to_period('M') #matches['year_week'] = matches.date.dt.to_period('W') # # #------------------------------------------- # Get only valid and clean rides logger.info("Loading Rides Data") cursor.execute( """ SELECT d.name as community, a.ride_id, a.date, b.hour, a.created_at as publication_date, b.driver_id, b.type, b.seats, b.begin_location_gps, b.end_location_gps, b.distance_value, a.updated_at, YEAR(a.date) as ride_year, MONTH(a.date) as ride_year, WEEK(a.date) as ride_week, DAYOFWEEK(a.date) as ride_dow, DAY(a.date) as ride_day, HOUR(b.hour) as ride_hour FROM ride_dates AS a JOIN rides AS b ON a.ride_id = b.ride_id JOIN communities as d ON b.community_id = d.id WHERE a.deleted_at IS NULL AND b.driver_id not in (%s) """ % format_strings, tuple(dup_users)) rows = cursor.fetchall() rides = pd.DataFrame([[ij for ij in i] for i in rows]) # Add column names rides.columns = [ 'community', 'ride_id', 'date', 'hour', 'publication_date', 'driver_id', 'type', 'seats', 'begin_location_gps', 'end_location_gps', 'distance_value', 'updated_at', 'ride_year', 'ride_month', 'ride_week', 'ride_dow', 'ride_day', 'ride_hour' ] #Standarise date types rides['date'] = pd.to_datetime(rides['date']) rides.index = rides['date'] rides['year_month'] = rides.date.dt.to_period('M') logger.info("Finish Loading Data") stop = timeit.default_timer() logger.info(stop - start) conn.close() #------------------------------------------------------- #df2 = users.groupby('commute_mode').resample("M").count() return users, rides, matches #matches_day = matches.groupby(['community','ride_year','ride_month']) #mujeres = pd.read_csv("/Users/natisangarita/TryMyRide/mujeres.csv") #hombres = pd.read_csv("/Users/natisangarita/TryMyRide/hombres.csv") #select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, count(*) desc #airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])
#backing out tweet volume for i in range(len(df)): count = 0 start = 0 if i > 175: start = i - 175 for j in range(start,i): if df.loc[i,'elapsed'] - df.loc[j,'elapsed'] < .25 and df.loc[i,'elapsed'] - df.loc[j,'elapsed'] > 0: count += 1 vol_list.append(count) #finalizing df df['tweet_volume'] = vol_list df['audio_volume'] = df['tweet_volume']/df.tweet_volume.max() df = sqldf("SELECT * FROM df WHERE elapsed <= 90.1") #Plotting plt.style.use('ggplot') fig, ax = plt.subplots() #Creating a histogram of tweet volume ax.hist(df.elapsed, bins = 240,alpha=0.4,label='Tweets Sent',color='b') #A line plot of audio volume ax.plot(df.elapsed,df.audio_volume*100, color='g',label='Audio Volume (%)',alpha=.95) #Manual title ax.set(title='Tweet Activity/Audio Volume: Bremen v. Leverkusen') ax.set_xlabel('Minutes since start') ax.set_ylim(0, 230) ax.set_xlim(0, 91) ax.legend()
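# The double loop above counts, for each tweet, how many earlier tweets fall strictly
# within the previous 0.25 units of 'elapsed'. A vectorized sketch with numpy.searchsorted,
# assuming 'elapsed' is sorted ascending and ignoring the 175-row cap the loop uses as a
# performance shortcut:
import numpy as np

def windowed_tweet_counts(elapsed, window=0.25):
    e = np.asarray(elapsed)
    left = np.searchsorted(e, e - window, side='right')  # first index strictly inside the window
    here = np.searchsorted(e, e, side='left')            # first index at or beyond the current value
    return here - left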
if pos == -1: pos = sqlfilter.find('from') if pos == -1: # SQL format not supported sqlfilter = '' if pos > -1: # found FROM, now find WHERE pos2 = sqlfilter.find('WHERE') if pos2 == -1: pos2 = sqlfilter.find('where') if pos2 == -1: # SQL format not supported sqlfilter = '' if pos2 > -1: # found FROM and WHERE, create new SQL query with netflowData as the table name sqlfilter = sqlfilter[:pos + 4] + ' netflowData ' + sqlfilter[pos2:] netflowData = pd.read_csv(fileName) # Reduce the NetFlow data frame using the SQL query if (sqlfilter != '') and ((sqlfilter[:6] == 'SELECT') or (sqlfilter[:6] == 'select')): print('Applying SQL filter') netflowData = pds.sqldf(sqlfilter, globals()) sns.set(style="white", color_codes=True) sns_plot = sns.jointplot(x=sys.argv[2], y=sys.argv[3], data=netflowData) sns_plot.fig.suptitle("Bivariate Distribution for " + fileName.split("\\")[-1], y=1.08) sns_plot.savefig("images/bivariate.svg")
cliente = pd.read_csv('cliente.csv')
producto = pd.read_csv('producto.csv')
tiempo = pd.read_csv('tiempo.csv')

"""First, we discard the people who already have a mortgage: since they already
carry a debt, we want them to pay it off before we can offer them another
mortgage loan."""

q1 = """SELECT cliente.cliente_id, cliente.nombre_cliente, cliente.ingreso,
producto.tipo_producto_desc, producto.producto_id, producto.producto_desc,
tiempo.fecha, hechos.saldo_$ as saldo, producto.numero_cuenta
FROM hechos
JOIN cliente ON cliente.llave_cliente == hechos.llave_cliente
JOIN producto ON producto.llave_producto == hechos.llave_producto
JOIN tiempo ON tiempo.llave_tiempo == hechos.llave_tiempo
WHERE producto.tipo_producto_desc != "Mortgage"
"""
data = ps.sqldf(q1, locals())

# Convert the column to a date type so we can operate on it.
data['fecha'] = pd.to_datetime(data['fecha'], format='%d/%m/%Y')
data.sort_values(by='fecha')
#data.index = data['fecha']

"""Now that we know which people already have mortgage loans, we look at the
balance behaviour of the people who are still candidates for a loan offer."""

q2 = """SELECT cliente.cliente_id, cliente.nombre_cliente, cliente.ingreso,
producto.tipo_producto_desc, producto.producto_id, producto.producto_desc,
tiempo.fecha, hechos.saldo_$ as saldo, producto.numero_cuenta
FROM hechos
JOIN cliente ON cliente.llave_cliente == hechos.llave_cliente
JOIN producto ON producto.llave_producto == hechos.llave_producto
JOIN tiempo ON tiempo.llave_tiempo == hechos.llave_tiempo
@author: elara """ import platform import pandas as pd from pandasql import sqldf import gensim import logging import multiprocessing import itertools import numpy as np cores = multiprocessing.cpu_count() logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) pysqldf = lambda q: sqldf(q, globals()) if platform.system() == 'Linux': main_path = '/mnt/c/' if platform.system() == 'Windows': main_path = 'C:/' test_corpus_path = main_path+ 'Elara/Documents/paper/LDA/lda_test.csv' test_text = pd.read_csv(test_corpus_path,encoding='utf-8',engine='python',names = ['i','content']) i = list(set(test_text['i']))[0] texts =[i.split() for i in test_text.loc[test_text['i']==i]['content']] dictionary = gensim.corpora.Dictionary(texts) text_train = [dictionary.doc2bow(i) for i in texts] lda1 = gensim.models.ldamodel.LdaModel(corpus=text_train, num_topics=165, id2word=dictionary, distributed=False, chunksize=2000, passes=20
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Quasi']
if model_type == "credit_batch":
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Batch']
if model_type == "debit_pin":
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Pin']
if model_type == "debit_sign":
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Signature']

data.to_csv(out_dir + model_type + '_' + "data.csv")

pysql = lambda q: pdsql.sqldf(q, globals())

# Create overall dataframe
q_f_all = '''
select MODEL_SCORE_NR,
       MODEL_SCORE_NR_SCALE,
       MODEL_SCORE_NR_SCALE_INV,
       MODEL_SCORE_NR_BIN,
       sum(NUM_TXNS_VALID) as SUM_NUM_TXNS_VALID,
       sum(NUM_TXNS_FRAUD) as SUM_NUM_TXNS_FRAUD,
       sum(NUM_TXNS) as SUM_NUM_TXNS,
       sum(SALES_VALID) as SUM_SALES_VALID,
       sum(SALES_FRAUD) as SUM_SALES_FRAUD,
       sum(SALES) as SUM_SALES,
       sum(APPROVED_TRANS_VALID) as SUM_APPROVED_TRANS_VALID,
diabetes_df.describe()
diabetes_df.groupby("readmitted").size()

# Find null or missing values
diabetes_df.isnull().sum()
diabetes_df.isna().sum()

# Exploratory analysis time
# How many encounters per patient?
q = """Select patient_nbr, count(distinct encounter_id) as encounters
       from diabetes_df
       group by 1
       order by 2 desc"""
patient_encounters = sqldf(q)

q = """Select AVG(encounters) as average_encounters
       from (Select patient_nbr, count(distinct encounter_id) as encounters
             from diabetes_df
             group by 1
             order by 2 desc)"""
avg_patient_encounters = sqldf(q)

# Mean figures: medications etc.
q = """Select AVG(num_procedures) as mean_num_procedures,
       AVG(num_medications) as mean_num_medications,
       AVG(num_lab_procedures) as mean_num_lab_procedures,
       AVG(time_in_hospital) as average_time_in_hospital,
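# A hedged pandas-only equivalent of the encounters-per-patient query above
# (same diabetes_df columns assumed; the *_pd names are illustrative).
patient_encounters_pd = (diabetes_df.groupby("patient_nbr")["encounter_id"]
                         .nunique()
                         .rename("encounters")
                         .sort_values(ascending=False)
                         .reset_index())
avg_patient_encounters_pd = patient_encounters_pd["encounters"].mean()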
def state_names():
    q = "SELECT DISTINCT State FROM df"
    pysqldf = lambda q: sqldf(q, globals())
    a_df = pysqldf(q)
    return jsonify(list(a_df.State))
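# jsonify suggests the function above lives inside a Flask app. A minimal hedged sketch
# of how such an endpoint could be wired up -- the app object, route path, CSV source,
# and the renamed view function are assumptions, not part of the original code.
from flask import Flask, jsonify
import pandas as pd
from pandasql import sqldf

app = Flask(__name__)
df = pd.read_csv("states.csv")  # assumed data source with a 'State' column

@app.route("/states")  # assumed route path
def list_state_names():
    a_df = sqldf("SELECT DISTINCT State FROM df", globals())
    return jsonify(list(a_df.State))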
def Total_holdings(primary_key, secret_key):
    if (KeyFormModel):
        key = primary_key
        secret = secret_key

        # python3
        secret_bytes = bytes(secret, encoding='utf-8')
        # python2
        # secret_bytes = bytes(secret)

        # Generating a timestamp.
        timeStamp = int(round(time.time() * 1000))
        body = {"timestamp": timeStamp}
        json_body = json.dumps(body, separators=(',', ':'))
        signature = hmac.new(secret_bytes, json_body.encode(), hashlib.sha256).hexdigest()

        url = "https://api.coindcx.com/exchange/v1/users/balances"
        headers = {
            'Content-Type': 'application/json',
            'X-AUTH-APIKEY': key,
            'X-AUTH-SIGNATURE': signature
        }
        response = requests.post(url, data=json_body, headers=headers)
        data = response.json()

        curr_list = []
        for i in data:
            if (i["balance"] != '0.0'):
                curr_list.append(i["currency"])  # (curr_list is not used further below)

        # Enter your API Key and Secret here. If you don't have one, you can generate it from the website.
        # python3
        secret_bytes = bytes(secret, encoding='utf-8')

        # Generating a fresh timestamp and signature for the second request
        timeStamp = int(round(time.time() * 1000))
        body = {"timestamp": timeStamp}
        json_body = json.dumps(body, separators=(',', ':'))
        signature = hmac.new(secret_bytes, json_body.encode(), hashlib.sha256).hexdigest()

        url = "https://api.coindcx.com/exchange/v1/users/balances"
        headers = {
            'Content-Type': 'application/json',
            'X-AUTH-APIKEY': key,
            'X-AUTH-SIGNATURE': signature
        }

        currency = []
        balance = []
        locked_balance = []
        response = requests.post(url, data=json_body, headers=headers)
        data = response.json()
        for curr in data:
            if (curr['balance'] != '0.0' and curr['currency'] not in ['ALGO', 'INR']):
                balance.append(curr['balance'])
                currency.append(curr['currency'] + 'INR')
                locked_balance.append(curr['locked_balance'])

        PORTFOLIO = pd.DataFrame({
            'CURRENCY': currency,
            'BALANCE': balance,
            'LOCKED_BALANCE': locked_balance
        })

        url = "https://api.coindcx.com/exchange/ticker"
        response = requests.get(url)
        data = response.json()
        market = []
        last_price = []
        for currdetail in data:
            if (currdetail['market'] in currency):
                market.append(currdetail['market'])
                last_price.append(currdetail['last_price'])

        market_ticker = go.Figure(data=[
            go.Table(header=dict(values=['market', 'last_price']),
                     cells=dict(values=[market, last_price]))
        ])

        LAST_PRICE_TABLE = pd.DataFrame({
            'MARKET': market,
            'LAST_PRICE': last_price
        })

        # Balances arrive as strings; SQLite coerces them to numbers in the arithmetic below.
        TOTAL = ps.sqldf('''
            SELECT A.MARKET, A.LAST_PRICE, B.BALANCE as ACTIVE_BALANCE, B.LOCKED_BALANCE,
                   (B.BALANCE + LOCKED_BALANCE) as TOTAL_BALANCE,
                   (LAST_PRICE * (B.BALANCE + LOCKED_BALANCE)) as TOTAL_HOLDING
            FROM LAST_PRICE_TABLE A
            JOIN PORTFOLIO B ON A.MARKET = B.CURRENCY''')
        PIE = ps.sqldf('''SELECT MARKET, TOTAL_HOLDING FROM TOTAL''')

        dfpair = {'total_df': TOTAL, 'pie_df': PIE}
        return (dfpair)
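# The balances endpoint above is signed twice with identical logic. A hedged helper that
# factors out the HMAC-SHA256 signing pattern used there; the function and parameter
# names are illustrative, not part of the original code.
import hashlib
import hmac
import json
import time
import requests

def signed_post(url, key, secret, payload=None):
    # Sign a CoinDCX-style JSON body with HMAC-SHA256 and POST it.
    payload = dict(payload or {}, timestamp=int(round(time.time() * 1000)))
    json_body = json.dumps(payload, separators=(',', ':'))
    signature = hmac.new(bytes(secret, 'utf-8'), json_body.encode(), hashlib.sha256).hexdigest()
    headers = {
        'Content-Type': 'application/json',
        'X-AUTH-APIKEY': key,
        'X-AUTH-SIGNATURE': signature,
    }
    return requests.post(url, data=json_body, headers=headers)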
# Add headers, parse the purchase date, and compute days since purchase
data.columns = ['customer_id', 'purchase_amount', 'date_of_purchase']
data['date_of_purchase'] = pd.to_datetime(data.date_of_purchase)
data['days_since'] = (pd.Timestamp('2016-01-01') - data['date_of_purchase']).dt.days

# Display the data after transformation
data.head()
data.describe()

# Compute key marketing indicators using SQL:
# recency, frequency, and average purchase amount per customer
customers = sqldf(
    "SELECT customer_id, MIN(days_since) AS 'recency', COUNT(*) AS 'frequency', "
    "AVG(purchase_amount) AS 'amount' FROM data GROUP BY 1",
    globals())

# Explore the data
customers.head()
customers.describe()
customers.recency.hist(bins=20)
customers.frequency.hist(bins=24)
customers.amount.hist()
customers.amount.hist(bins=99)

# --- PREPARING AND TRANSFORMING DATA ----------------------

# Copy customer data into new data frame
new_data = customers
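# A hedged pandas-only equivalent of the recency/frequency/amount query above
# (same `data` frame assumed; the customers_pd name is illustrative).
customers_pd = (data.groupby('customer_id')
                    .agg(recency=('days_since', 'min'),
                         frequency=('customer_id', 'count'),
                         amount=('purchase_amount', 'mean'))
                    .reset_index())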
import geoplotlib
from pandasql import sqldf
import pandas as pd

df = pd.read_csv('data.csv')
destco = pd.read_csv('destination_latlong.csv')
orgco = pd.read_csv('origin_latlong.csv')

df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], format='%Y-%m-%d')
airline = df.set_index('FL_DATE')

q = """
SELECT A.ORIGIN_CITY_NAME, A.DEST_CITY_NAME,
       DC.lat As dlat, DC.lon as dlon,
       OC.lat As olat, OC.lon as olon
FROM airline as A
join destco as DC on A.DEST_CITY_NAME = DC.DEST_CITY_NAME
join orgco as OC on A.ORIGIN_CITY_NAME = OC.ORIGIN_CITY_NAME
WHERE DEP_DELAY = 0 AND ARR_DELAY = 0;
"""

print('working..')
result = sqldf(q, locals())

geoplotlib.graph(result, src_lat='olat', src_lon='olon', dest_lat='dlat', dest_lon='dlon',
                 color='rainbow', alpha=32, linewidth=2)
geoplotlib.savefig('no delay')
def merge_and_filter_ind_disaster(days, var, min_obs_in_month, min_share_month):
    ########################################################################
    # Loading interpolated measures according to the specified number of days
    # of interpolation
    file_name = "estimated_data/interpolated_D/int_ind_disaster_days_" + str(days) + ".csv"
    D_df = pd.read_csv(file_name)

    # Dealing with dates:
    D_df["date"] = pd.to_datetime(D_df["date"])
    D_df["date_adj"] = D_df["date"] + pd.offsets.MonthEnd(0)
    D_df = D_df.drop("date", axis=1)

    ########################################################################
    # Limiting to companies with at least `min_obs_in_month` observations in a month
    # in at least a `min_share_month` share of the months in the sample
    # (e.g. 15 observations in 80% of months from January 1996 to December 2017).
    def min_month_obs(x):
        return x[var].count() > min_obs_in_month

    D_filter_1 = D_df.groupby(["secid", "date_adj"]).filter(min_month_obs)
    D_mon_mean = D_filter_1.groupby(["secid", "date_adj"]).mean().reset_index()

    num_months = len(np.unique(D_mon_mean["date_adj"]))

    def min_sample_obs(x):
        return x[var].count() > num_months * min_share_month

    D_filter = D_mon_mean.groupby("secid").filter(min_sample_obs)

    ########################################################################
    # Loading data on monthly returns and linking datasets:
    ret_df = pd.read_csv("estimated_data/crsp_data/crsp_monthly_returns.csv")
    ret_df["MV"] = ret_df["prc"] * ret_df["shrout"]
    oclink = pd.read_csv("estimated_data/crsp_data/optionmetrics_crsp_link.csv")

    # Getting the best link for each month end of D-clamp:
    oclink = oclink[oclink.score < 6]
    oclink["sdate"] = [str(int(x)) for x in oclink["sdate"]]
    oclink["sdate"] = pd.to_datetime(oclink["sdate"], format="%Y%m%d")
    oclink["edate"] = [str(int(x)) for x in oclink["edate"]]
    oclink["edate"] = pd.to_datetime(oclink["edate"], format="%Y%m%d")

    q1 = """
    select
        d.*,
        s1.permno as permno_1,
        s2.permno as permno_2,
        s3.permno as permno_3,
        s4.permno as permno_4,
        s5.permno as permno_5
    from D_filter as d
    left join (select secid, permno, sdate, edate from oclink where score = 1) as s1
        on d.secid = s1.secid and d.date_adj >= s1.sdate and d.date_adj <= s1.edate
    left join (select secid, permno, sdate, edate from oclink where score = 2) as s2
        on d.secid = s2.secid and d.date_adj >= s2.sdate and d.date_adj <= s2.edate
    left join (select secid, permno, sdate, edate from oclink where score = 3) as s3
        on d.secid = s3.secid and d.date_adj >= s3.sdate and d.date_adj <= s3.edate
    left join (select secid, permno, sdate, edate from oclink where score = 4) as s4
        on d.secid = s4.secid and d.date_adj >= s4.sdate and d.date_adj <= s4.edate
    left join (select secid, permno, sdate, edate from oclink where score = 5) as s5
        on d.secid = s5.secid and d.date_adj >= s5.sdate and d.date_adj <= s5.edate
    """
    tmp = sqldf(q1, locals())

    # Keeping the best available match (lowest link score first):
    q2 = """
    select *, COALESCE(permno_1, permno_2, permno_3, permno_4, permno_5) as permno
    from tmp
    """
    disaster_ret_df = sqldf(q2, locals())
    disaster_ret_df = disaster_ret_df.drop(
        ["permno_1", "permno_2", "permno_3", "permno_4", "permno_5"], axis=1)

    # Merging with returns next month:
    disaster_ret_df = disaster_ret_df.rename({"date_adj": "date"}, axis=1)
    disaster_ret_df["date"] = pd.to_datetime(disaster_ret_df["date"])
    disaster_ret_df["month_lead"] = disaster_ret_df["date"] + pd.offsets.MonthEnd(1)
    disaster_ret_df = disaster_ret_df.drop("date", axis=1)

    ret_df["date"] = pd.to_datetime(ret_df["date"])
    ret_df["date"] = ret_df["date"] + pd.offsets.MonthEnd(0)

    # Merging this month's disaster variable with next month's return on the stock:
    disaster_ret_df = pd.merge(disaster_ret_df,
                               ret_df[["date", "permno", "ret"]],
                               left_on=["permno", "month_lead"],
                               right_on=["permno", "date"],
                               how="left")

    # Merging this month's disaster variable and next month's return on the stock
    # with this month's market value = |PRC| * SHROUT
    disaster_ret_df = pd.merge(disaster_ret_df,
                               ret_df[["date", "permno", "MV"]],
                               left_on=["permno", "date"],
                               right_on=["permno", "date"],
                               how="left")

    return disaster_ret_df
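# A hedged pandas equivalent of the COALESCE step above: take the first non-null permno
# across the score-ranked candidate columns (illustrative, operating on the `tmp` frame
# built inside the function).
link_cols = ["permno_1", "permno_2", "permno_3", "permno_4", "permno_5"]
tmp["permno"] = tmp[link_cols].bfill(axis=1).iloc[:, 0]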
#print(data)
#print(data.describe())
#print(data[['Fare']].apply(np.mean))
#['Survived'],['Fare'],['Sex']
df = pd.DataFrame(data=rdata, columns=['Fare', 'Sex', 'Age', 'Survived'])
#print(df.describe())
#print(df)
#df['Age'] = df['Age'].fillna(df['Age'].mean())
#
#print(df)
df['Age'] = df['Age'].fillna(np.mean(df['Age']))
print(df)
print(np.sum(df['Age']))
print(np.mean(df['Age']))
'''

aadhaar_data = pd.read_csv(
    "C:\\Users\\berag\\SampleProject\\DataScience\\aadhaar_data.csv")
aadhaar_data.rename(columns=lambda x: x.replace(' ', '_').lower(), inplace=True)

q = """
SELECT *
FROM aadhaar_data
LIMIT 50
-- SQL Comment
"""

#aadhaar_solution = pandasql.sqldf(q.lower(), locals())
aadhaar_solution = pandasql.sqldf(q, globals())
b = pd.DataFrame(data=aadhaar_solution)
print(b)
def runBoostingRegressorWithSubstrings_and_Times(amount_of_runs, host_name, root_name,
                                                 passw_root, database_name, query):
    total_true = 0     # correctly predicted pass/fail over both languages combined
    total_prolog = 0   # correctly predicted pass/fail for Prolog
    total_haskell = 0  # correctly predicted pass/fail for Haskell
    total_avg_deviation = 0  # sum of the average deviation of each run
    total_avg_deviation_both = 0
    length_prediction_list = 1  # the number of predictions made each run

    # A dataframe with the needed data
    query_result = Database_Functions.query_database_dataframe(
        host_name, root_name, passw_root, database_name, query)
    query_result, big_dict, time_dict = preprocessing_2(query_result)
    query_result = pandasql.sqldf(Queries.get_query_09_1819_df("query_result"), locals())

    # A dataframe with all user_id's and their scores
    grades = query_result[['user_id', 'score_prolog', 'score_haskell']].drop_duplicates(subset='user_id')
    grades.reset_index(drop=True, inplace=True)  # reset the index (purely cosmetic)

    possible_categories = query_result.query('language==1')['category'].unique()

    # Merge the time dict into query_result.
    query_result = integrate_times_into_df(time_dict, query_result)

    # selecting only prolog as cat
    # possible_categories = query_result['category'].unique()
    # preprocessing(host_name, root_name, passw_root, database_name, Queries.get_query_06_)

    big_result_list = []
    for x in range(amount_of_runs):  # in this loop the experiment gets repeated
        print("run number " + str(x))

        # A random 10% of users for verification; drop that sample to keep 90% for training.
        verification_df = grades.sample(frac=0.1)
        train_df = grades.drop(verification_df.index)
        training_users = set(train_df['user_id'].tolist())
        verification_users = set(verification_df['user_id'].tolist())

        relevant_subset, total_freq_subset = get_relevant_subset(training_users, big_dict)
        trees, frequency_list_df_training = TreeConstructor.create_trees_with_subsets(
            train_df, relevant_subset, total_freq_subset)

        data_points_training_df = query_result.iloc[np.where(
            query_result.user_id.isin(training_users))]
        # create_trees_with_subsets gives one boosting tree per category; predict one
        # score per user and append it to the dataframe.
        data_points_training_df = add_freq_predictions_to_df(
            trees, data_points_training_df, frequency_list_df_training)

        frequency_list_df_ver = make_frequency_list_df(big_dict, verification_users,
                                                       total_freq_subset)

        # All submissions of the selected users: drop the training rows to form the verification data.
        data_points_verification_df = query_result.drop(data_points_training_df.index)
        data_points_verification_df = add_freq_predictions_to_df(
            trees, data_points_verification_df, frequency_list_df_ver)

        # A dictionary of trained decision trees, keyed by category.
        my_boosting_trees = TreeConstructor.build_big_boostingtree_with_dataframe(
            data_points_training_df, possible_categories)

        # This function returns two lists containing lists of grades (floats):
        predicted_list, actual_verification = TreeConstructor.make_boosting_predictions_with_grades_in_df(
            my_boosting_trees, data_points_verification_df, possible_categories)
        # the predictions and the actual grades to compare.
        # for x in range(len(predicted_list)):
        #     print(predicted_list[x][0])
        #     print(actual_verification[x])

        # Calculate all the data we need for this run.
        pass_fail_result = pass_fail_boosting2(predicted_list, actual_verification)
        deviation = average_deviation_boosting2(predicted_list, actual_verification)

        # Accumulate the per-run results; at the end we divide by the total number of runs.
        total_avg_deviation += deviation[0]
        total_avg_deviation_both += deviation[1]
        total_true += sum([x[1] for x in pass_fail_result])
        total_prolog += sum([x[0][0] for x in pass_fail_result])
        total_haskell += sum([x[0][1] for x in pass_fail_result])

        if length_prediction_list != len(pass_fail_result):
            length_prediction_list = len(pass_fail_result)

        big_result_list += [
            predicted_list[x][0].tolist() + actual_verification[x]
            for x in range(len(predicted_list))
        ]

    df = DataFrame(big_result_list,
                   columns=["Predicted Prolog", "Predicted Haskell",
                            "Actual Prolog", "Actual Haskell"])

    return [
        total_true / amount_of_runs,
        total_prolog / amount_of_runs,
        total_haskell / amount_of_runs,
        total_avg_deviation / amount_of_runs,
        length_prediction_list,
        total_avg_deviation_both / amount_of_runs,
        df
    ]