def min_temperature_on_rainy_days(filename):
    '''
    This function should run a SQL query on a dataframe of
    weather data. More specifically you want to find the average
    minimum temperature on rainy days where the minimum temperature
    is greater than 55 degrees.
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    You can see the weather data that we are passing in below:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    '''

    weather_data = pandas.read_csv(filename)

    q = """
    select avg(mintempi) from weather_data where cast(rain as integer) = 1 and cast(mintempi as integer) > 55
    """

    #Execute your SQL command against the pandas frame
    avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals())
    return avg_min_temp_rainy
    
Example #2
def test_noleak_legacy(db_uri):
    df = pd.DataFrame({'x': [1]})
    result = sqldf("SELECT * FROM df", db_uri=db_uri)
    pdtest.assert_frame_equal(df, result)
    del df
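    # with df deleted from the namespace, its table must not have leaked into the
    # persistent database behind db_uri, so the next query is expected to fail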
    with pytest.raises(PandaSQLException):
        result = sqldf("SELECT * FROM df", db_uri=db_uri)
Example #3
    def test_query_with_spacing(self):

        df = pd.DataFrame({
            "letter_pos": [i for i in range(len(string.letters))],
            "l2": list(string.letters)
        })

        df2 = pd.DataFrame({
            "letter_pos": [i for i in range(len(string.letters))],
            "letter": list(string.letters)
        })
        
        result = sqldf("SELECT a.*, b.letter FROM df a INNER JOIN df2 b ON a.l2 = b.letter LIMIT 20;", locals())
        self.assertEquals(len(result), 20)

        q = """
            SELECT
            a.*
        FROM
            df a
        INNER JOIN
            df2 b
        on a.l2 = b.letter
        LIMIT 20
        ;"""
        result = sqldf(q, locals())
        self.assertEquals(len(result), 20)
Example #4
 def count(self):
     eventsdb = self.eventsdb
     eventsdb_sorted = sqldf('SELECT * from eventsdb ORDER BY id',locals())
     numevents = len(eventsdb)
     count = [i for i in range(0,numevents)]
     eventsdb_sorted['count'] = count
     self.eventsdb = sqldf('SELECT * from eventsdb_sorted ORDER BY id',locals())
     self.eventsdb_subset = self.eventsdb
Example #5
 def survival_probability(self):
     eventsdb = self.eventsdb
     eventsdb_sorted = sqldf('SELECT * from eventsdb ORDER BY duration_us',locals())
     numevents = len(eventsdb)
     survival = [1.0 - float(i)/float(numevents) for i in range(0,numevents)]
     eventsdb_sorted['survival_probability'] = survival
     self.eventsdb = sqldf('SELECT * from eventsdb_sorted ORDER BY id',locals())
     self.eventsdb_subset = self.eventsdb
Example #6
 def delay_probability(self):
     eventsdb = self.eventsdb
     eventsdb_sorted = sqldf('SELECT * from eventsdb ORDER BY event_delay_s',locals())
     numevents = len(eventsdb)
     delay = [1.0 - float(i)/float(numevents) for i in range(0,numevents)]
     eventsdb_sorted['delay_probability'] = delay
     self.eventsdb = sqldf('SELECT * from eventsdb_sorted ORDER BY id',locals())
     self.eventsdb_subset = self.eventsdb
def entries_time_period_bar(turnstile_weather):

    # add new fields
    turnstile_weather['dayofweek'] = pd.DatetimeIndex(turnstile_weather['DATEn']).dayofweek
    turnstile_weather['day'] = pd.DatetimeIndex(turnstile_weather['DATEn']).day

    turnstile_weather['weekday'] = 0
    turnstile_weather.loc[turnstile_weather['dayofweek'] <= 4, 'weekday'] = 1

    q = """
    select *
    ,case
    when Hour <=5 then 1
    when weekday = 0 and Hour >= 6 then 2
    when weekday = 1 and Hour between 10 and 14 then 3
    when weekday = 1 and Hour >= 20 then 4
    when weekday = 1 and Hour between 6 and 9 then 5
    when weekday = 1 and Hour between 15 and 19 then 5
    else 0 end as time_period
    from turnstile_weather
    """
    
    #Execute your SQL command against the pandas frame
    turnstile_weather2 = pandasql.sqldf(q.lower(), locals())


    q2 = """
    select time_period,sum(ENTRIESn_hourly) as ENTRIESn_hourly
    from turnstile_weather2
    group by time_period
    """
    
    #Execute your SQL command against the pandas frame
    entries_time_period = pandasql.sqldf(q2.lower(), locals())

    plt.figure()
    plt.title('Turnstile Entries by Time Period')
    plt.ylabel('Turnstile Entries (in millions)')
    plt.xlabel('Time Period')

    y = entries_time_period['ENTRIESn_hourly']/1000000
    x = entries_time_period['time_period']
    labels = ['Late Night','Weekends','Midday','Evening','Rush Hour']

    plt.xticks(x, labels)
    plt.xlim(0,6)
    plt.ylim(0,25)
    plt.bar(x, y,width=0.25,align='center',color='DodgerBlue')

    #print entries_time_period

    return plt
def compare_averages(filename):
    """
    Performs a t-test on two sets of baseball data (left-handed and right-handed hitters).

    You will be given a csv file that has three columns.  A player's
    name, handedness (L for lefthanded or R for righthanded) and their
    career batting average (called 'avg'). You can look at the csv
    file via the following link:
    https://www.dropbox.com/s/xcn0u2uxm8c4n6l/baseball_data.csv

    Write a function that will read that the csv file into a pandas data frame,
    and run Welch's t-test on the two cohorts defined by handedness.

    One cohort should be a data frame of right-handed batters. And the other
    cohort should be a data frame of left-handed batters.

    We have included the scipy.stats library to help you write
    or implement Welch's t-test:
    http://docs.scipy.org/doc/scipy/reference/stats.html

    With a significance level of 95%, if there is no difference
    between the two cohorts, return a tuple consisting of
    True, and then the tuple returned by scipy.stats.ttest.

    If there is a difference, return a tuple consisting of
    False, and then the tuple returned by scipy.stats.ttest.

    For example, the tuple that you return may look like:
    (True, (9.93570222, 0.000023))
    """
    theframe=pandas.read_csv(filename)
    turnstile_weather=theframe
    q="select * from theframe where rain = 1"
    #wasraining=theframe[theframe['rain']==1]
    wasraining=pandasql.sqldf(q.lower(), locals())
    r ="select * from theframe where rain = 0"
    wasnotraining=pandasql.sqldf(r.lower(), locals())
    #wasraining=theframe[['ENTRIESn_hourly']][theframe['rain']==1]
    #numpy.nan_to_num(wasraining)

    wasnotraining=theframe[['ENTRIESn_hourly']][theframe['rain']==0]
    #numpy.nan_to_num(wasnotraining)
    result=scipy.stats.mannwhitneyu(turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==1], turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==0])
    #result=scipy.stats.mannwhitneyu(wasraining['ENTRIESn_hourly'],wasnotraining['ENTRIESn_hourly'])
    #result=scipy.stats.ttest_ind(wasraining['ENTRIESn_hourly'],wasnotraining['ENTRIESn_hourly'],equal_var=False)    
    print(len(wasraining.index))
    print(len(wasnotraining.index))
    
    
    return result
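
# The docstring above describes Welch's t-test on left- vs right-handed batters, while
# the body actually runs a Mann-Whitney U test on turnstile data. A minimal sketch of
# the test the docstring asks for, assuming the baseball csv layout it describes
# (columns 'handedness' and 'avg'):
import pandas
import scipy.stats

def compare_averages_ttest(filename):
    frame = pandas.read_csv(filename)
    lefties = frame[frame['handedness'] == 'L']['avg']
    righties = frame[frame['handedness'] == 'R']['avg']
    t, p = scipy.stats.ttest_ind(lefties, righties, equal_var=False)
    # at a 0.05 significance level, p >= 0.05 means no detectable difference
    return (p >= 0.05, (t, p))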
def nentries_mean(df,col_name):
    """
    mean of the total number of entries for all units for each hour or day
    """    

    # sum of the number of entries per hour for all units
    q = 'SELECT ' + col_name + ', daten, SUM(entriesperhr) AS totentries FROM df GROUP BY ' + col_name + ', daten;'
    df = pandasql.sqldf(q.lower(), locals())
    
    #mean of the total number of entries for all unit
    q = 'SELECT ' + col_name + ', AVG(totentries) AS avgentries FROM df GROUP BY ' + col_name + ';'
    df = pandasql.sqldf(q.lower(), locals())
    
    return df
Example #10
def num_rainy_days(filename):
    '''
    Function returning various sql statements regarding this weather data:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    '''
    weather_data = pandas.read_csv(filename)
    
    rain_count = """
    SELECT count(rain) FROM weather_data WHERE rain = 1;
    """
    maxTemp_fog = """
    SELECT fog, max(maxtempi) FROM weather_data WHERE fog = 0
    UNION
    SELECT fog, max(maxtempi) FROM weather_data WHERE fog = 1;
    """
    meanTemp_weekend = """
    select avg(cast (meantempi as integer)) from weather_data
    where cast (strftime('%w', date) as integer) in (0, 6);
    """
    avgMinTemp_rain = """
    select avg(mintempi) from weather_data where rain = 1 and mintempi > 55;
    """
    
    #Execute one of the SQL commands against the pandas frame
    rainy_count = pandasql.sqldf(rain_count.lower(), locals())
    return rainy_count
def entries_day_line(turnstile_weather):

    # Add Time Period 
    turnstile_weather = time_period(turnstile_weather)

    q = """
    select day_week as Day_Week,sum(ENTRIESn_hourly) as ENTRIESn_hourly
    from turnstile_weather
    group by day_week
    """
    
    #Execute your SQL command against the pandas frame
    entries_day = pandasql.sqldf(q, locals())

    plt.figure()
    plt.title('Turnstile Entries by Day (5/1/2011 - 5/31/2011)')
    plt.ylabel('Turnstile Entries (in millions)')
    plt.xlabel('Day of Week')

    y = entries_day['ENTRIESn_hourly']/1000000
    x = entries_day['Day_Week']
    labels = ['Mon','Tue','Wed','Thur','Fri', 'Sat', 'Sun']

    plt.xticks(x, labels)
    plt.xlim(-1,7)
    plt.ylim(0,25)
    plt.plot(x,y,marker='.',linestyle='--')

    #print entries_day

    return plt
Example #12
def aggregate_query(filename):
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(),
                        inplace=True)

    # Write a query that will select from the aadhaar_data table how
    # many men and how many women over the age of 50 have had
    # aadhaar generated for them in each district
    #
    # The possible columns to select from aadhaar data are:
    #     1) Registrar
    #     2) Enrolment Agency
    #     3) State
    #     4) District
    #     5) Sub District
    #     6) Pin Code
    #     7) Gender
    #     8) Age
    #     9) Aadhaar generated
    #     10) Enrolment Rejected
    #     11) Residents providing email,
    #     12) Residents providing mobile number
    #
    # You can download a copy of the aadhaar data that we are passing
    # into this exercise below:
    # https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv

    q = """
    select gender, district, sum(aadhaar_generated) from aadhaar_data where age > 50 group by gender, district
    """

    aadhaar_solution = pandasql.sqldf(q.lower(), locals())
    return aadhaar_solution
Example #13
def avg_min_temperature(filename):
    weather_data = pandas.read_csv(filename)
    q = """
    SELECT avg(mintempi) FROM weather_data WHERE mintempi > 55 AND rain = 1;
    """
    avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals())
    return avg_min_temp_rainy
def select_first_50(filename):
    # Read in our aadhaar_data csv to a pandas dataframe.  Afterwards, we rename the columns
    # by replacing spaces with underscores and setting all characters to lowercase, so the
    # column names more closely resemble columns names one might find in a table.
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)

    # Select out the first 50 values for "registrar" and "enrolment_agency"
    # in the aadhaar_data table using SQL syntax. 
    #
    # Note that "enrolment_agency" is spelled with one l. Also, the order
    # of the select does matter. Make sure you select registrar then enrolment agency
    # in your query.
    q = """
    -- YOUR QUERY HERE
    SELECT
    registrar, enrolment_agency
    FROM
    aadhaar_data
    LIMIT 50;
    """

    #Execute your SQL command against the pandas frame
    aadhaar_solution = pandasql.sqldf(q.lower(), locals())
    return aadhaar_solution    
Example #15
def get_turnstile_samples(filename):
    df = pandas.read_csv(filename)
    query = """SELECT DATEn, sum(ENTRIESn_hourly) as entries, rain
               FROM df
               GROUP BY DATEn"""
    samples = pandasql.sqldf(query.lower(), locals())
    return samples
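
# Note: the query above selects rain without an aggregate while grouping by DATEn, so
# SQLite returns the rain value of an arbitrary row within each date. A sketch of a
# deterministic variant (assuming rain is a 0/1 flag per record, so MAX means
# "it rained at some point that day"):
import pandas
import pandasql

def get_turnstile_samples_deterministic(filename):
    df = pandas.read_csv(filename)
    query = """SELECT DATEn, sum(ENTRIESn_hourly) as entries, max(rain) as rain
               FROM df
               GROUP BY DATEn"""
    return pandasql.sqldf(query.lower(), locals())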
Example #16
def max_temp_aggregate_by_fog(filename):
    weather_data = pandas.read_csv(filename)
    q = """
    SELECT fog, MAX(maxtempi) FROM weather_data GROUP BY fog;
    """
    foggy_days = pandasql.sqldf(q.lower(), locals())
    return foggy_days
def avg_min_temperature(filename):
    '''
    This function should run a SQL query on a dataframe of
    weather data. More specifically you want to find the average
    minimum temperature (mintempi column of the weather dataframe) on 
    rainy days where the minimum temperature is greater than 55 degrees.
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    You can see the weather data that we are passing in below:
    https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv
    '''
    weather_data = pandas.read_csv(filename)

    q = """
        SELECT AVG(mintempi)
        FROM weather_data
        WHERE rain = 1 AND cast(mintempi as integer) > 55 
    """
    
    #Execute your SQL command against the pandas frame
    avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals())
    return avg_min_temp_rainy
Example #18
def num_rainy_days(filename):
    '''
    This function should run a SQL query on a dataframe of
    weather data.  The SQL query should return one column and
    one row - a count of the number of days in the dataframe where
    the rain column is equal to 1 (i.e., the number of days it
    rained).  The dataframe will be titled 'weather_data'. You'll
    need to provide the SQL query.  You might find SQL's count function
    useful for this exercise.  You can read more about it here:
    
    https://dev.mysql.com/doc/refman/5.1/en/counting-rows.html
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    You can see the weather data that we are passing in below:
    https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv
    '''
    weather_data = pandas.read_csv(filename)
    
    q = """
        SELECT Count(date)
        FROM weather_data
        WHERE rain == 1
    """
    
    #Execute your SQL command against the pandas frame
    rainy_days = pandasql.sqldf(q.lower(), locals())
    return rainy_days
Example #19
def filter_by_regular(filename):
    '''
    This function should read the csv file located at filename into a pandas dataframe,
    and filter the dataframe to only rows where the 'DESCn' column has the value 'REGULAR'.
    
    For example, if the pandas dataframe is as follows:
    ,C/A,UNIT,SCP,DATEn,TIMEn,DESCn,ENTRIESn,EXITSn
    0,A002,R051,02-00-00,05-01-11,00:00:00,REGULAR,3144312,1088151
    1,A002,R051,02-00-00,05-01-11,04:00:00,DOOR,3144335,1088159
    2,A002,R051,02-00-00,05-01-11,08:00:00,REGULAR,3144353,1088177
    3,A002,R051,02-00-00,05-01-11,12:00:00,DOOR,3144424,1088231
    
    The dataframe will look like below after filtering to only rows where DESCn column
    has the value 'REGULAR':
    0,A002,R051,02-00-00,05-01-11,00:00:00,REGULAR,3144312,1088151
    2,A002,R051,02-00-00,05-01-11,08:00:00,REGULAR,3144353,1088177
    '''
    turnstile_data = pandas.read_csv(filename)
    q = """
    select *
    from turnstile_data
    where DESCn like 'REGULAR'
    """
    #Execute your SQL command against the pandas frame
    turnstile_data_results = pandasql.sqldf(q.lower(), locals())
    #print turnstile_data_results
    
    return turnstile_data_results
Example #20
def num_rainy_days(filename):
    weather_data = pandas.read_csv(filename, na_values='')
    q = '''
    SELECT count(rain) AS weather_data from weather_data where rain = 1;
    '''
    rainy_days = pandasql.sqldf(q.lower(), locals())
    return rainy_days
Example #21
def max_temp_aggregate_by_fog(filename):
    """
    This function should run a SQL query on a dataframe of
    weather data.  The SQL query should return two columns and
    two rows - whether it was foggy or not (0 or 1) and the max
    maxtempi for that fog value (i.e., the maximum max temperature
    for both foggy and non-foggy days).  The dataframe will be 
    titled 'weather_data'. You'll need to provide the SQL query.
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    You can see the weather data that we are passing in below:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    """
    weather_data = pandas.read_csv(filename)

    q = "SELECT fog, MAX(cast (maxtempi as integer)) FROM weather_data WHERE cast(fog as integer) = 1 UNION SELECT fog, MAX(cast (maxtempi as integer)) FROM weather_data WHERE cast(fog as integer) = 0"

    # Execute your SQL command against the pandas frame
    foggy_days = pandasql.sqldf(q.lower(), locals())
    return foggy_days
def num_rainy_days(filename):
    '''
    This function should run a SQL query on a dataframe of
    weather data.  The SQL query should return one column and
    one row - a count of the number of days in the dataframe where
    the rain column is equal to 1 (i.e., the number of days it
    rained).  The dataframe will be titled 'weather_data'. You'll
    need to provide the SQL query.  You might find SQL's count function
    useful for this exercise.  You can read more about it here:
    
    https://dev.mysql.com/doc/refman/5.1/en/counting-rows.html
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    You can see the weather data that we are passing in below:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    '''
    weather_data = pandas.read_csv(filename)

    q = """
    select count(*) from weather_data where rain = 1
    """
    
    #Execute your SQL command against the pandas frame
    rainy_days = pandasql.sqldf(q.lower(), locals())
    return rainy_days
def avg_weekend_temperature(filename):
    '''
    The SQL query should return one column and
    one row - the average meantempi on days that are a Saturday
    or Sunday (i.e., the average mean temperature on weekends).
    The dataframe will be titled 'weather_data' and you can access
    the date in the dataframe via the 'date' column.
    
    Also, you can convert dates to days of the week via the 'strftime' keyword in SQL.
    For example, cast (strftime('%w', date) as integer) will return 0 if the date
    is a Sunday or 6 if the date is a Saturday.
    '''
    weather_data = pandas.read_csv(filename)
    #print weather_data.columns.values
    
    q = """
    SELECT
    avg(cast(meantempi as integer))
    FROM
    weather_data
    WHERE
    cast(strftime('%w', date) as integer) == 0 or cast(strftime('%w', date) as integer) == 6
    """
    
    #Execute your SQL command against the pandas frame
    mean_temp_weekends = pandasql.sqldf(q.lower(), locals())
    return mean_temp_weekends
def aggregate_query(filename):
    # Read in our aadhaar_data csv to a pandas dataframe.  Afterwards, we rename the columns
    # by replacing spaces with underscores and setting all characters to lowercase, so the
    # column names more closely resemble columns names one might find in a table.
    
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)

    # Write a query that will select from the aadhaar_data table how many men and how 
    # many women over the age of 50 have had aadhaar generated for them in each district.
    # aadhaar_generated is a column in the Aadhaar Data that denotes the number who have had
    # aadhaar generated in each row of the table.
    #   
    q = """
    SELECT
    gender, district, sum(aadhaar_generated)
    FROM
    aadhaar_data
    WHERE
    age > 50
    GROUP BY
    gender, district
    """
    # Execute your SQL command against the pandas frame
    aadhaar_solution = pandasql.sqldf(q.lower(), locals())
    return aadhaar_solution
def select_first_50(filename):
    # Read in our aadhaar_data csv to a pandas dataframe.  Afterwards, we rename the columns
    # by replacing spaces with underscores and setting all characters to lowercase, so the
    # column names more closely resemble columns names one might find in a table.
    aadhaar_data = pandas.read_csv(filename)
    aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)

    # Select out the first 50 values for "registrar" and "enrolment_agency"
    # in the aadhaar_data table using SQL syntax. 
    #
    # Note that "enrolment_agency" is spelled with one l. Also, the order
    # of the select does matter. Make sure you select registrar then enrolment agency
    # in your query.
    #
    # You can download a copy of the aadhaar data that we are passing 
    # into this exercise below:
    # https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv
    q = """
    select "registrar", "enrolment_agency" from aadhaar_data
    limit 50;
    """

    #Execute your SQL command against the pandas frame
    aadhaar_solution = pandasql.sqldf(q.lower(), locals())
    return aadhaar_solution   
def avg_weekend_temperature(filename):
    '''
    This function should run a SQL query on a dataframe of
    weather data.  The SQL query should return one column and
    one row - the average meantempi on days that are a Saturday
    or Sunday (i.e., the average mean temperature on weekends).
    The dataframe will be titled 'weather_data' and you can access
    the date in the dataframe via the 'date' column.
    
    You'll need to provide  the SQL query.
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    Also, you can convert dates to days of the week via the 'strftime' keyword in SQL.
    For example, cast (strftime('%w', date) as integer) will return 0 if the date
    is a Sunday or 6 if the date is a Saturday.
    
    You can see the weather data that we are passing in below:
    https://www.dropbox.com/s/7sf0yqc9ykpq3w8/weather_underground.csv
    '''
    weather_data = pandas.read_csv(filename)

    q = """
    select avg(cast(meantempi as integer)) from weather_data where cast (strftime('%w', date) as integer) = 0 or cast(strftime('%w', date) as integer) = 6
    """
    
    #Execute your SQL command against the pandas frame
    mean_temp_weekends = pandasql.sqldf(q.lower(), locals())
    return mean_temp_weekends
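
# A quick cross-check of the strftime('%w') weekend logic in plain pandas. Note the
# different weekday numbering: SQLite's %w uses Sunday=0 / Saturday=6, while pandas'
# dt.dayofweek uses Monday=0 / Sunday=6, so weekends are 5 and 6 there. (Sketch only,
# assuming the 'date' column parses with pandas.to_datetime.)
import pandas

def avg_weekend_temperature_pandas(filename):
    weather_data = pandas.read_csv(filename)
    dow = pandas.to_datetime(weather_data['date']).dt.dayofweek
    return weather_data.loc[dow.isin([5, 6]), 'meantempi'].mean()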
def num_rainy_days(filename):
    '''
    This function should run a SQL query on a dataframe of
    weather data.  The SQL query should return one column and
    one row - a count of the number of days in the dataframe where
    the rain column is equal to 1 (i.e., the number of days it
    rained).
                https://dev.mysql.com/doc/refman/5.1/en/counting-rows.html
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    '''
    weather_data = pandas.read_csv(filename)
    #print weather_data.head()
    #print weather_data.columns.values
    #print numpy.sum(weather_data['rain'] > 0)
    q = """
    SELECT
    COUNT(*)
    FROM
    weather_data
    WHERE
    rain == 1
    """
    #Execute your SQL command against the pandas frame
    rainy_days = pandasql.sqldf(q.lower(), locals())
    return rainy_days
def max_temp_aggregate_by_fog(filename):
	'''
	This function should run a SQL query on a dataframe of
	weather data.  The SQL query should return two columns and
	two rows - whether it was foggy or not (0 or 1) and the max
	maxtempi for that fog value (i.e., the maximum max temperature
	for both foggy and non-foggy days).  The dataframe will be
	titled 'weather_data'. You'll need to provide the SQL query.

	You might also find that interpreting numbers as integers or floats may not
	work initially.  In order to get around this issue, it may be useful to cast
	these numbers as integers.  This can be done by writing cast(column as integer).
	So for example, if we wanted to cast the maxtempi column as an integer, we would actually
	write something like where cast(maxtempi as integer) = 76, as opposed to simply
	where maxtempi = 76.

	You can see the weather data that we are passing in below:
	https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv
	'''
	weather_data = pandas.read_csv(filename)

	q = """
	SELECT fog, max(maxtempi)
	FROM weather_data
	GROUP BY fog;
	"""

	#Execute your SQL command against the pandas frame
	foggy_days = pandasql.sqldf(q.lower(), locals())
	return foggy_days
Example #29
 def test_select(self):
     df = pd.DataFrame({
              "letter_pos": [i for i in range(len(string.letters))],
              "l2": list(string.letters)
     })
     result = sqldf("select * from df LIMIT 10;", locals())
     self.assertEquals(len(result), 10)
Example #30
def entries_histogram(df):   
    # Sum entries by date and UNIT
    global daily_entries
    daily_entries = (df[['DATEn','UNIT','ENTRIESn_hourly']].
                        groupby(['DATEn','UNIT']).sum())
    daily_entries = daily_entries.reset_index()
    # Group rain by date
    global daily_rain
    daily_rain = df[['DATEn','rain']].groupby('DATEn').mean()
    daily_rain = daily_rain.reset_index()
    daily_rain.loc[:,'rain'] = daily_rain['rain'].apply(lambda x: reduce(x))
    # Join daily_entries and daily_rain tables on date
    from pandasql import sqldf
    pysqldf = lambda q: sqldf(q, globals())
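    # daily_entries and daily_rain were declared as module-level globals above precisely
    # so that sqldf(q, globals()) can resolve those table names inside the query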
    q = ('''SELECT e.DATEn, e.UNIT, e.ENTRIESn_hourly, p.rain FROM 
    daily_entries e JOIN daily_rain p ON e.DATEn = p.DATEn;''')
    daily_entries = pysqldf(q)
    # Divide daily_entries into rain and no-rain tables
    daily_entries.loc[:, 'entries_log'] = (daily_entries['ENTRIESn_hourly'].
                                            apply(lambda x: take_log(x)))
    no_rain = daily_entries[daily_entries.rain==0]
    rain = daily_entries[daily_entries.rain==1]
    x = [no_rain['entries_log'], rain['entries_log']]
    # plot histogram
    plt.hist(x, range = (0, 16), bins = 23, color=['k','m'], 
             label=["no rain","rain"])
    plt.xlabel("log of ENTRIESn_hourly summed by date and remote unit")
    plt.ylabel("Frequency")
    legend = plt.legend()
    return plt
from   sklearn                 import decomposition, preprocessing, cluster, metrics
from   sklearn.linear_model    import LinearRegression, LogisticRegression
from   sklearn.metrics         import confusion_matrix, roc_curve, auc
from   sklearn.model_selection import train_test_split

import six
from   math              import *

# /////  For prettier output than the built-in "print" function  //////
from IPython.display     import display, Markdown, HTML, display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

# /////  To run SQL sanity-check queries against DataFrames  /////
from pandasql            import sqldf
execsql = lambda q: sqldf(q, globals())   
# USAGE EXAMPLE
# ----------------------
# req1 = ''' Select zone1, zone2 From DataFrame Where zone3=xx and zone4='xx' limit 3;'''
# df1 = execsql(req1)
# df1
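
# A concrete, runnable variant of the usage sketched above (the toy DataFrame and its
# column names are assumptions for illustration, not project data):
import pandas as pd
toy = pd.DataFrame({'zone1': [1, 2, 3], 'zone2': ['a', 'b', 'c'], 'zone3': [10, 20, 30]})
print(execsql("SELECT zone1, zone2 FROM toy WHERE zone3 >= 20 LIMIT 3;"))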

import time

# Limit the number of DataFrame rows displayed
pd.options.display.max_rows=20
def main():

    #yelp_integrated_frame = integrate_restaurants_business()
    yelp_integrated_frame = fetch_biz_data()
    inspection_data = fetch_inspection_data()
    inspection_data = inspection_data[inspection_data['Results'].str.lower().str.contains('fail')]
    yelp_integrated_frame['DATE_ISSUED'] = pd.to_datetime(yelp_integrated_frame['DATE_ISSUED'], format='%m/%d/%Y')
    yelp_integrated_frame['LICENSE_STATUS_CHANGE_DATE'] = pd.to_datetime(yelp_integrated_frame['LICENSE_STATUS_CHANGE_DATE'], format='%m/%d/%Y')
    inspection_data['Inspection_Date'] = pd.to_datetime(inspection_data['Inspection_Date'], format='%m/%d/%Y')


    inspection_data_test= inspection_data.groupby(['DOING_BUSINESS_AS_NAME', 'ADDRESS'])['Inspection_Date'].max().reset_index()

    yelp_int_test = yelp_integrated_frame.groupby(['DOING_BUSINESS_AS_NAME', 'ADDRESS'])['DATE_ISSUED'].max().reset_index()

    integ = pd.merge(inspection_data_test,yelp_int_test, on=['DOING_BUSINESS_AS_NAME'])

    #integ = integ.sort_values('DOING_BUSINESS_AS_NAME')

    #pysqldf = lambda q: sqldf(q, globals())

    new_df = pd.DataFrame()

    for dummy,index in integ.iterrows():
        add_x = index['ADDRESS_x']
        add_y = index['ADDRESS_y']
        measure = similar(add_x, add_y)
        if measure >= 0.8 and index['Inspection_Date'].year <= 2014:
            index["diff"] = (index['Inspection_Date'] - index['DATE_ISSUED']).days
            new_df = new_df.append(index)




    new_df = new_df[new_df['diff'] > 750]
    new_df = new_df[['DOING_BUSINESS_AS_NAME','ADDRESS_x','Inspection_Date', 'diff']]
    new_df = new_df.groupby(['DOING_BUSINESS_AS_NAME','ADDRESS_x'])['Inspection_Date','diff'].max().reset_index()



    q = """SELECT * FROM integ WHERE integ.ADDRESS_x LIKE '%' || integ.ADDRESS_y || '%'"""

    dff = psql.sqldf(q, locals())

    first_issued_lic_data = yelp_integrated_frame.groupby("DOING_BUSINESS_AS_NAME", as_index=False)["DATE_ISSUED"].max()
    yelp_integrated_frame = pd.merge(yelp_integrated_frame,first_issued_lic_data, on=['DOING_BUSINESS_AS_NAME','DOING_BUSINESS_AS_NAME',
                                                                                      'DATE_ISSUED','DATE_ISSUED',])
    yelp_integrated_frame = yelp_integrated_frame.drop_duplicates()

    yelp_integrated_frame = yelp_integrated_frame[yelp_integrated_frame['LICENSE_STATUS'].str.lower().str.contains('aac|rev')]

    merged = pd.merge(inspection_data,yelp_integrated_frame, on=['LICENSE_ID'])
    merged['diff'] = 0

    for dummy,index in merged.iterrows():
        merged.at[dummy,"diff"] = (index['LICENSE_STATUS_CHANGE_DATE'] - index['DATE_ISSUED']).days


    merged = merged[['DOING_BUSINESS_AS_NAME_x','ADDRESS_x','Inspection_Date','diff']]
    merged = merged.groupby(['DOING_BUSINESS_AS_NAME_x','ADDRESS_x'])['Inspection_Date','diff'].max().reset_index()

    merged = merged.rename(columns={'DOING_BUSINESS_AS_NAME_x': 'DOING_BUSINESS_AS_NAME'})


    final_merge = pd.concat([new_df, merged], ignore_index=True)
    final_merge = final_merge.groupby(['DOING_BUSINESS_AS_NAME','ADDRESS_x'])['Inspection_Date','diff'].max().reset_index()
    final_merge['diff'] = final_merge['diff'].apply(lambda x: float(x)/float(365))

    final_merge = final_merge.rename(columns={'DOING_BUSINESS_AS_NAME': 'Restaurant Name', 'Inspection_Date':'Failed inspection on','ADDRESS':'Address','diff':'Alive for x years'})
    final_merge.to_csv('biz_viability_out.csv', encoding='utf-8', index=False)







    print(yelp_integrated_frame)
    now = datetime.datetime.now()
    cur_year = now.year
                                             'SEG_NO':"SegNo",
                                             'SEG_LNGTH_FEET':"SegLenFt",
                                             "CUR_AADT":"CurAADT"})
 SegInfoData.columns
 SegInfoData.head()
 
 # Join data based on the begin and end segments 
 #************************************************************************************************************
 sqlcode = '''
 select Data1.ProjID, Data1.CountyCode, Data1.SR, Data1.BegSeg, Data1.BegOff, SegInfoData.SegNo, Data1.EndSeg
 from Data1
 left join SegInfoData on SegInfoData.CountyCode = Data1.CountyCode and SegInfoData.SR = Data1.SR
 and SegInfoData.SegNo = Data1.BegSeg
 '''
 
 TestDf = ps.sqldf(sqlcode,locals())
 TestDf.isna().sum()
 TestDf.groupby(['ProjID','CountyCode','SR','BegSeg','BegOff'])['BegSeg'].first().shape
 
 
 sqlcode = '''
 select Data1.ProjID, Data1.CountyCode, Data1.SR, Data1.BegSeg, SegInfoData.SegNo, Data1.EndSeg, 
 SegInfoData.SegLenFt, Data1.BegOff, Data1.EndOff, SegInfoData.CurAADT, SegInfoData.X_VALUE_BGN,
 SegInfoData.Y_VALUE_BGN, SegInfoData.X_VALUE_END, SegInfoData.Y_VALUE_END
 from Data1
 left join SegInfoData on SegInfoData.CountyCode = Data1.CountyCode and SegInfoData.SR = Data1.SR
 and SegInfoData.SegNo between Data1.BegSeg and Data1.EndSeg
 '''
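 # note: the BETWEEN join expands each Data1 project row into one output row per
 # SegInfoData segment whose SegNo lies between BegSeg and EndSeg (inclusive)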
 NewDf = ps.sqldf(sqlcode,locals())
 NewDf.isna().sum()
 NewDf.groupby(['ProjID','CountyCode','SR','BegSeg','BegOff'])['BegSeg'].first().shape
    def corrupt(self, X):
        """
        Note that here X is a pandas DataFrame.
        """

        # create new corrupted data
        Y = X.copy()
        # cast category to object
        Y[self.categories_dict.keys()] = Y[self.categories_dict.keys()].apply(
            lambda x: x.astype('object'))

        # get means and standard deviations
        # (will be used as noise for the numericas, does not distort statistics)
        means_col = X.mean()
        stds_col = X.std()

        # add auxiliary index
        X['indexcol'] = X.index

        # break the cCFDs in the list
        for ccfd in self.ccfds:

            ## Get Rows which hold the cCFD constraint
            cfd_cond_str = to_str_CFD_SQL(ccfd.LHS, ccfd.RHS)
            sql_query = "SELECT {} FROM {} WHERE {};".format(
                "indexcol", "X", cfd_cond_str)
            df_res = sqldf(sql_query, locals())

            # Get categories and respective probabilities in the dataset,
            # if ccfd.RHS[0] feature is categorical
            if X[ccfd.RHS[0]].dtype.name == 'category':
                cats = [
                    t for t in self.categories_dict[ccfd.RHS[0]]
                    if t != ccfd.RHS[1]
                ]
                cats_probs = X[ccfd.RHS[0]].value_counts()[cats].values
                cats_probs = cats_probs / float(cats_probs.sum())

            ## Insert Right Hand Side Noise (to violate the constraint)
            for row_idx in df_res['indexcol']:

                if numpy.random.rand() <= self.p:

                    # is categorical
                    if X[ccfd.RHS[0]].dtype.name == 'category':
                        # choose other categories according to their proportion in the dataset
                        idx_cat = numpy.random.choice(len(cats), 1, False,
                                                      cats_probs)[0]
                        Y.set_value(row_idx, ccfd.RHS[0], cats[idx_cat])

                    # is integer
                    elif X[ccfd.RHS[0]].dtype.name in [
                            'int16', 'int32', 'int64'
                    ]:
                        # noise the cell using the mean of column (with a fraction of the standard deviation)
                        Y.set_value(
                            row_idx, ccfd.RHS[0],
                            int(means_col[ccfd.RHS[0]] +
                                0.10 * stds_col[ccfd.RHS[0]]))

                    # is float
                    elif X[ccfd.RHS[0]].dtype.name in [
                            'float16', 'float32', 'float64'
                    ]:
                        # noise the cell using the mean of column (with a fraction of the standard deviation)
                        Y.set_value(
                            row_idx, ccfd.RHS[0],
                            float(means_col[ccfd.RHS[0]] +
                                  0.10 * stds_col[ccfd.RHS[0]]))

                    # Add Typo if none of above
                    else:
                        # noise the cell using standard typo (e.g. unique/rare)
                        Y.set_value(row_idx, ccfd.RHS[0],
                                    "*" + ccfd.RHS[1] + "*")

        #Testing
        #for ccfd in self.ccfds:
        #  # Get Rows which hold the cCFD constraint
        #  cfd_cond_str = to_str_CFD_SQL(ccfd.LHS, ccfd.RHS)
        #  sql_query = "SELECT {} FROM {} WHERE {};".format("count(*)", "Y", cfd_cond_str)
        #  df_res = sqldf(sql_query, locals())
        #  print df_res

        # drop auxiliary index
        X.drop('indexcol', axis=1, inplace=True)

        return Y
Example #35
def link_to_secid(df):
    """
    df should contain columns date and permno to get the match

    returns the same data frame with added column for OM secid
    """

    # Manually reading optionmetrics-crsp linking suite since there is
    # no dataset to download this from WRDS
    oclink = pd.read_csv(
        "estimated_data/crsp_data/optionmetrics_crsp_link.csv")

    # Getting the best link for each month end
    oclink = oclink[oclink.score < 6]
    oclink["sdate"] = [str(int(x)) for x in oclink["sdate"]]
    oclink["sdate"] = pd.to_datetime(oclink["sdate"], format="%Y%m%d")
    oclink["edate"] = [str(int(x)) for x in oclink["edate"]]
    oclink["edate"] = pd.to_datetime(oclink["edate"], format="%Y%m%d")

    q1 = """
    select 
        d.*,
        s1.secid as secid_1, 
        s2.secid as secid_2,
        s3.secid as secid_3,
        s4.secid as secid_4,
        s5.secid as secid_5
    from df as d
    
    left join (select secid, permno, sdate, edate from oclink where score = 1) as s1
    on d.permno = s1.permno and d.date >= s1.sdate and d.date <= s1.edate
    
    left join (select secid, permno, sdate, edate from oclink where score = 2) as s2
    on d.permno = s2.permno and d.date >= s2.sdate and d.date <= s2.edate
    
    left join (select secid, permno, sdate, edate from oclink where score = 3) as s3
    on d.permno = s3.permno and d.date >= s3.sdate and d.date <= s3.edate
    
    left join (select secid, permno, sdate, edate from oclink where score = 4) as s4
    on d.permno = s4.permno and d.date >= s4.sdate and d.date <= s4.edate
    
    left join (select secid, permno, sdate, edate from oclink where score = 5) as s5
    on d.permno = s5.permno and d.date >= s5.sdate and d.date <= s5.edate
    """

    tmp = sqldf(q1, locals())

    # Filtering and providing the best match:
    q2 = """
    select 
        *, COALESCE(secid_1, secid_2, secid_3, secid_4, secid_5) as secid
    from tmp
    """

    df = sqldf(q2, locals())
    df = df.drop(
        columns=["secid_1", "secid_2", "secid_3", "secid_4", "secid_5"])

    # Converting date columns to date format:
    df["date"] = pd.to_datetime(df["date"])

    return df
Example #36
    def update_trace(self):
        self.load_mapped_data()
        self.filtered_data = self.data
        self.plot_data = self.filtered_data
        plot_samplerate = self.samplerate

        if self.cutoff_entry.get() != '' and self.order_entry.get() != '':
            self.filter_data()
            self.plot_data = self.filtered_data
        if self.downsample_entry.get() != '':
            self.downsample_data()
            self.plot_data = self.downsampled_data
            plot_samplerate = float(self.downsample_entry.get())

        self.trace_fig.clf()
        a = self.trace_fig.add_subplot(111)

        if self.events_flag:
            db = self.ratefile
            start_time = self.start_time
            end_time = self.end_time
            good_start = np.squeeze(
                sqldf(
                    'SELECT start_time_s from db WHERE start_time_s >= {0} AND start_time_s < {1} AND type IN (0,1)'
                    .format(start_time, end_time), locals()).values) * 1e6
            bad_start = np.squeeze(
                sqldf(
                    'SELECT start_time_s from db WHERE start_time_s >= {0} AND start_time_s < {1} AND type>1'
                    .format(start_time, end_time), locals()).values) * 1e6
            good_end = np.squeeze(
                sqldf(
                    'SELECT end_time_s from db WHERE end_time_s >= {0} AND end_time_s < {1} AND type IN (0,1)'
                    .format(start_time, end_time), locals()).values) * 1e6
            bad_end = np.squeeze(
                sqldf(
                    'SELECT end_time_s from db WHERE end_time_s >= {0} AND end_time_s < {1} AND type>1'
                    .format(start_time, end_time), locals()).values) * 1e6

            for gs, ge in zip(np.atleast_1d(good_start),
                              np.atleast_1d(good_end)):
                a.axvspan(gs, ge, color='g', alpha=0.3)
            for bs, be in zip(np.atleast_1d(bad_start),
                              np.atleast_1d(bad_end)):
                a.axvspan(bs, be, color='r', alpha=0.3)

        time = np.linspace(1.0 / plot_samplerate,
                           len(self.plot_data) / plot_samplerate,
                           len(self.plot_data)) + self.start_time

        a.set_xlabel(r'Time ($\mu s$)')
        a.set_ylabel('Current (pA)')
        self.trace_fig.subplots_adjust(bottom=0.14, left=0.21)
        a.plot(time * 1e6, self.plot_data, '.', markersize=1)

        if self.baseline_flag:
            if self.config_cutoff != int(
                    self.cutoff_entry.get()) or self.config_order != int(
                        self.order_entry.get()):
                self.wildcard.set(
                    'Filter settings in config file do not match plotting filter settings, overlay will be inaccurate'
                )
            db = self.baseline_file
            start_time = self.start_time
            end_time = self.end_time
            times = np.squeeze(sqldf('SELECT time_s from db', locals()).values)
            times = np.sort(times)

            start_block = times[0]
            for time in times:
                if time <= start_time and time >= start_block:
                    start_block = time

            baseline_db = sqldf(
                'SELECT * from db WHERE time_s >= {0} and time_s < {1}'.format(
                    start_block, end_time), locals())
            times = baseline_db['time_s'].values
            means = baseline_db['baseline_pA'].values
            stdevs = baseline_db['stdev_pA'].values

            numblocks = len(means)
            for i in range(numblocks):
                if i == 0:
                    xmin = start_time
                else:
                    xmin = times[i]
                if i + 1 == numblocks:
                    xmax = end_time
                else:
                    xmax = times[i + 1]

                sign = np.sign(means[i])
                a.plot(
                    (xmin * 1e6, xmax * 1e6),
                    (means[i] - sign *
                     (self.threshold - self.hysteresis) * stdevs[i], means[i] -
                     sign * (self.threshold - self.hysteresis) * stdevs[i]),
                    '--',
                    color='y')
                a.plot((xmin * 1e6, xmax * 1e6),
                       (means[i] - sign * self.threshold * stdevs[i],
                        means[i] - sign * self.threshold * stdevs[i]),
                       '--',
                       color='y')
                a.plot((xmin * 1e6, xmax * 1e6), (means[i], means[i]),
                       '--',
                       color='black')

        self.trace_canvas.show()
Example #37

x_normd = Norm_dist(x5)
Out_Reports_norm = x_normd[0]
x6 = x_normd[1]

PAgr = pd.merge(Profile_data,
                Out_Reports_norm,
                how='left',
                left_on=['Item_Name'],
                right_on=['Item_Name'])

###########################################Pagr table updation######################################
x6['updated'] = 0
Imputation = sqldf(
    """SELECT Item_Name,count(*) as Miss_days FROM x6  where Original is NULL group by Item_Name""",
    locals())
PAgr = pd.merge(PAgr,
                Imputation,
                how='left',
                left_on=['Item_Name'],
                right_on=['Item_Name'])  #

if PAgr.isnull().values.any():  # if any value in PAgr is NaN, convert it to 0
    I = PAgr['Miss_days'].index[PAgr['Miss_days'].apply(np.isnan)]
    PAgr['Miss_days'][I.values] = 0

PAgr['Act_AfterImput'] = PAgr["Total_" +
                              forecastMessure] + PAgr['Miss_days'].astype(int)
PAgr['PcImputation'] = round(PAgr.Miss_days / PAgr.Act_AfterImput, 2)
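# Miss_days counts each item's days where Original was NULL (i.e. imputed values);
# PcImputation is that count as a share of Act_AfterImput, rounded to two decimals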
Example #38
import pandasql as sql


df=pd.read_csv('https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/aadhaar_data.csv')

df.rename(columns = lambda x :  x.replace(' ','_').lower(),inplace=True)

#### The command above replaces spaces in column names, e.g. 'enrolment agency' becomes 'enrolment_agency'


query="""select state, sum(Aadhaar_generated)   from df  group by state limit 50 ; """

query1="""select  sum(case when gender='M' then 1 else 0 end) as  male,

sum(case when gender='F' then 1 else 0 end)  as fem
from df  where age > 50 ; """

query2 = """

select gender ,sum(Aadhaar_generated) from df  where age > 50 group by gender ;
"""

print(sql.sqldf(query, locals()))

print(sql.sqldf(query1, locals()))

print(sql.sqldf(query2, locals()))


Example #39
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

tr['Cart'] = pd.cut(
    tr.Carats,
    bins=[0, 0.5, 1, 1.5, 2, 3, 4, 5, 15],
    labels=['0_0.5', '0.5_1', '1-1.5', '1.5-2', '2-3', '3-4', '4-5', '5+'],
    include_lowest=True)

final = pd.crosstab(tr.Vendor, tr.Cart, tr.Price, aggfunc="mean")
final

final2 = pd.crosstab(tr.Cert, tr.Cart, tr.Profit_flg, aggfunc="sum")
final2

q = """SELECT Cert , Profit_flg ,avg(Profit) as Avg_Profit ,count(0) as freq FROM tr group by Cert , Profit_flg;"""
print(ps.sqldf(q, locals()))

q = """SELECT Vendor , Cart ,avg(Price) as Avg_Pr ,count(0) as freq FROM tr group by Vendor, Cart;"""
print(ps.sqldf(q, locals()))
q = """SELECT Shape ,count(0) as count FROM tr group by Shape;"""
print(ps.sqldf(q, locals()))

#remove white space
tr['Shape1'] = tr['Shape'].str.replace(r'\s+', '')

tr['Shape2'] = pd.np.where(
    tr.Shape1.str.contains('Marquis|Marwuise'), 'Marquise',
    pd.np.where(tr.Shape1.str.contains('ROUND'), 'Round', tr['Shape1']))
tr['Shape'] = tr['Shape2']
del (tr['Shape1'], tr['Shape2'])
import pandas as pd
import numpy as np
from numpy import unique
from datetime import datetime
import pymysql
pymysql.install_as_MySQLdb()
import MySQLdb
import pandas.io.sql as psql
import sqlalchemy as sq
from sqlalchemy import create_engine
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import re
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
from time import gmtime, strftime
import datetime

db = pymysql.connect('34.214.211.162', 'mobiloansteam', 'team123456',
                     'mobiloans')
cur = db.cursor(pymysql.cursors.DictCursor)
UserId = 'mobiloansteam'

auto_dialer = pd.read_sql(
    "SELECT * FROM mobiloans_auto_dialer where date >= '20180306' and date <= '20180405' ",
    con=db)
manual_dialer = pd.read_sql(
    "SELECT * FROM mobiloans_manual_dialer where date >= '20180306' and date <= '20180405' ",
    db)
sql_payment = pd.read_sql(
    "select loan_number AS AccountNumber,count(transaction_amount) AS Count_payment,sum(transaction_amount) AS payment FROM mobiloans.mobiloans_payment_file WHERE transaction_type_description='Payment' and transaction_effective_date >= '2018-03-06' and transaction_effective_date <= '2018-04-05' GROUP BY loan_number ",
def print_result():
    # Execute your SQL command against the pandas frame
    rainy_days = pandasql.sqldf(q.lower(), globals())
    print(rainy_days)
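
# print_result() above looks up both the query string q and the queried DataFrame in
# globals(), so they must exist at module level before the call. A minimal sketch of
# that setup (the file name and query below are illustrative assumptions, not taken
# from the original example):
import pandas
import pandasql

weather_data = pandas.read_csv('weather_underground.csv')  # assumed file name
q = "SELECT count(*) FROM weather_data WHERE rain = 1"
print_result()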
Example #42
# encoding=utf-8
"""Commonly used SQL snippets"""

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, LongType, StructType
import pandasql
import numpy as np
import pandas as pd
import os
schema = [("name", ""), ("age", 0.0)]
data = [("a", 1), ("b", 2)]

pysqldf = lambda sql: pandasql.sqldf(sql, globals()) # SQL query engine
test_df = pd.DataFrame(np.array([[1,2], [3,4]]))
test_df.columns = ["a", "b"]
pysqldf("select * from test_df") # test_df is the variable name of the DataFrame being queried

def _build_field(name_type):
    type_map = {
        str: StructField(name_type[0], StringType(), True),
        int: StructField(name_type[0], IntegerType(), True),
        float: StructField(name_type[0], LongType(), True)
    }
    return type_map.get(type(name_type[1]))


def build_df_by_schema(rdd, schemas, tb_name=None):
    spark = SQL.spark
    df_schema = StructType([_build_field(x) for x in schemas])
    df_rdd = spark.createDataFrame(rdd, df_schema)
    if tb_name:
Example #43
                labels=dict_labels,
                title="Fig 2: Curva de contagios en Punta Arenas")
fig_2.update_layout(showlegend=False)
data_g3 = data_comunas[(filt_reg) & (filt_not_pa) ] \
            .groupby(["fecha", "region", "comuna"])["casos"] \
            .sum() \
            .reset_index()
fig_3 = px.line(data_g3,
                x="fecha",
                y="casos",
                color="comuna",
                labels=dict_labels,
                title="Fig 3: Curva de contagios fuera de Punta Arenas")
fig_3.update_layout(showlegend=True)
p_4 = "Quizás, más ilustrativo sea ver la velocidad de aumento de casos en las comunas de la región:"
comunas_unique = ps.sqldf("SELECT DISTINCT comuna FROM data_comunas WHERE region = 'Magallanes'") \
                    .to_dict(orient = "records")
data_g4 = pd.DataFrame()
for comuna in data_comunas[filt_reg].comuna.unique():
    data_tmp = data_comunas[(filt_reg) & (data_comunas.comuna == comuna)] \
                .groupby(["fecha", "region", "comuna"])["casos"] \
                .sum() \
                .diff(periods = 1) \
                .reset_index()
    data_g4 = data_g4.append(data_tmp)
    del data_tmp
fig_4 = px.line(
    data_g4,
    x="fecha",
    y="casos",
    color="comuna",
    labels=dict_labels,
Example #44
def deaths():
    df = pd.read_csv(deaths_path)
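    # no environment is passed here, so pandasql's sqldf falls back to inspecting the
    # caller's namespace, which is how it finds the local df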
    return sqldf("SELECT * FROM df").to_json()
Example #45
def cases():
    df = pd.read_csv(cases_path)
    return sqldf("SELECT * FROM df").to_json()
Example #46
    This function should run a SQL query on a dataframe of
    weather data.  The SQL query should return two columns and
    two rows - whether it was foggy or not (0 or 1) and the max
    maxtempi for that fog value (i.e., the maximum max temperature
    for both foggy and non-foggy days).  The dataframe will be 
    titled 'weather_data'. You'll need to provide the SQL query.
    
    You might also find that interpreting numbers as integers or floats may not
    work initially.  In order to get around this issue, it may be useful to cast
    these numbers as integers.  This can be done by writing cast(column as integer).
    So for example, if we wanted to cast the maxtempi column as an integer, we would actually
    write something like where cast(maxtempi as integer) = 76, as opposed to simply 
    where maxtempi = 76.
    
    You can see the weather data that we are passing in below:
    https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv
'''
import pandas
import pandasql

filename = r'A:\DataScience\Intro_to_DS\weather-underground.csv'
weather_data = pandas.read_csv(filename)

q = """
SELECT fog, MAX (maxtempi) FROM weather_data GROUP by fog
"""

#Execute your SQL command against the pandas frame
foggy_days = pandasql.sqldf(q.lower(), locals())
print(foggy_days)
Example #47
def load_data():
    # This needs to be migrated, perhaps AWS configuration manager?
    host = "localhost"
    dbname = "test"
    user = "******"
    password = "******"

    conn = pymysql.connect(host, user=user, passwd=password, db=dbname)

    logger = get_logger("PreProcessing")
    logger.info("Loading Data")

    # Create variable for pandasql
    pysqldf = lambda q: sqldf(q, globals())

    # start = timeit.default_timer()
    # cursor = conn.cursor()
    # cursor.execute('select * from matches');
    # rows = cursor.fetchall()
    # matches = pd.DataFrame( [[ij for ij in i] for i in rows] )
    # stop = timeit.default_timer()
    #
    # print ("Cursor execute")
    # print(stop - start)

    cursor = conn.cursor()
    start = timeit.default_timer()

    # #----------------------------------------------
    # # Load users table, remove admin type users
    logger.info("Loading User Data")

    # Users can be associated to more than one community, if adding communities here users will have duplicated values
    # when using users later on, drop community column and drop duplicates

    # users = pd.read_sql('SELECT user_id, first_name, last_name, commute_mode, neighborhood, created_at as registration_date,\
    #                     main_email, \
    #                     CASE WHEN (user_id not in(select driver_id from rides)\
    #                                 AND user_id not in \
    #                                 (select user_id from passengers) ) THEN "REGISTERED"\
    #                                 ELSE "ACTIVE" END  AS user_type FROM users \
    #                     WHERE  type != "admin" AND validated_email=1 AND main_email NOT IN \
    #                         (SELECT main_email FROM (\
    #                             SELECT main_email , count(user_id) as dup FROM users GROUP BY 1 \
    #                                 HAVING dup>1) as a);', con=conn)

    cursor.execute("""
                SELECT a.user_id, a.first_name, a.last_name, a.commute_mode, a.neighborhood, a.created_at as date,
                        a.main_email, c.name as community,
                CASE WHEN (a.user_id not in(select driver_id from rides) AND a.user_id not in 
                (select user_id from passengers) ) THEN "REGISTERED"
                ELSE "ACTIVE" END AS user_type 
                FROM users as a
                LEFT JOIN user_communities as b
                ON a.user_id = b.user_id
                LEFT JOIN communities as c
                ON b.community_id = c.id
                WHERE  type != "admin" AND validated_email=1 AND main_email NOT IN 
                            (SELECT main_email FROM (
                                SELECT main_email , count(user_id) as dup FROM users GROUP BY 1 
                                    HAVING dup>1) as a);

    """)
    rows = cursor.fetchall()
    users = pd.DataFrame([[ij for ij in i] for i in rows])
    users.columns = [
        'user_id', 'first_name', 'last_name', 'commute_mode', 'neighborhood',
        'date', 'main_email', 'community', 'user_type'
    ]

    # add new id
    users["new_id"] = users.index + 1
    users['date'] = pd.to_datetime(users['date'])
    users['reg_date_ym'] = users.date.dt.to_period('M')
    users.index = users['date']
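    # Hedged note (not part of the original flow): per the comment above, a per-user
    # view without duplicated community rows could later be built like this:
    # users_unique = users.drop(columns=['community']).drop_duplicates(subset='user_id')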

    # #-----------------------------------------------
    # Look for duplicated users and remove them from further queries; most likely these are all admin accounts
    dup_users1 = pd.read_sql('SELECT user_id FROM users \
                            WHERE main_email IN \
                            (SELECT main_email FROM (\
                                SELECT main_email , count(user_id) as dup FROM users GROUP BY 1 \
                                    HAVING dup>1) as a);',
                             con=conn)

    dup_users = dup_users1['user_id'].values.tolist()
    format_strings = ','.join(['%s'] * len(dup_users))

    # #------------------------------------------------
    # # Create the matches table, join with the rides, passengers and community
    # logger.info("Loading Matches Data")

    logger.info("Loading Matches Data")

    cursor.execute(
        """
    SELECT d.name as community, a.ride_id, a.date, b.hour, a.created_at as publication_date,  b.driver_id,
            c.user_id as passenger_id, coalesce(c.created_at, NULL) as match_date,
            b.type, b.seats, b.begin_location_gps,
            b.end_location_gps, b.distance_value,
            a.updated_at, c.updated_at AS pass_updated_at,
            YEAR(a.date) as ride_year, MONTH(a.date) as ride_month,
            WEEK(a.date) as ride_week, DAYOFWEEK(a.date) as ride_dow, DAY(a.date) as ride_day, HOUR(b.hour) as ride_hour
    FROM ride_dates AS a
    JOIN rides  AS b  ON a.ride_id = b.ride_id
    JOIN passengers as c
    ON a.ride_id = c.ride_id
    AND a.date = c.date
    JOIN communities as d
    ON b.community_id = d.id
    WHERE a.deleted_at IS NULL
    AND c.user_id not in (%s)
    """ % format_strings, tuple(dup_users))

    rows = cursor.fetchall()
    matches = pd.DataFrame([[ij for ij in i] for i in rows])

    # Add column names
    matches.columns = [
        'community', 'ride_id', 'date', 'hour', 'publication_date',
        'driver_id', 'passenger_id', 'match_date', 'type', 'seats',
        'begin_location_gps', 'end_location_gps', 'distance_value',
        'updated_at', 'pass_updated_at', 'ride_year', 'ride_month',
        'ride_week', 'ride_dow', 'ride_day', 'ride_hour'
    ]

    #Standardise date types
    matches['date'] = pd.to_datetime(matches['date'])
    matches['publication_date'] = pd.to_datetime(matches['publication_date'])
    matches['match_date'] = pd.to_datetime(matches['match_date'])
    matches.index = matches['date']
    #df.resample('M').agg(dict(score='count'))

    #matches['ride'] = matches.date.dt.to_period('M')
    #matches['year_week'] = matches.date.dt.to_period('W')

    #
    #
    #-------------------------------------------
    # Get only valid and clean rides

    logger.info("Loading Rides Data")

    cursor.execute(
        """
    SELECT d.name as community, a.ride_id, a.date, b.hour, a.created_at as publication_date, b.driver_id,
                    b.type, b.seats, b.begin_location_gps, b.end_location_gps, b.distance_value,
                    a.updated_at, YEAR(a.date) as ride_year, MONTH(a.date) as ride_month,  
                    WEEK(a.date) as ride_week, DAYOFWEEK(a.date) as ride_dow, DAY(a.date) as ride_day, 
                    HOUR(b.hour) as ride_hour
    FROM ride_dates AS a
    JOIN rides  AS b  ON a.ride_id = b.ride_id
    JOIN communities as d
    ON b.community_id = d.id
    WHERE a.deleted_at IS NULL 
    AND b.driver_id not in (%s)
    """ % format_strings, tuple(dup_users))

    rows = cursor.fetchall()
    rides = pd.DataFrame([[ij for ij in i] for i in rows])
    # Add column names
    rides.columns = [
        'community', 'ride_id', 'date', 'hour', 'publication_date',
        'driver_id', 'type', 'seats', 'begin_location_gps', 'end_location_gps',
        'distance_value', 'updated_at', 'ride_year', 'ride_month', 'ride_week',
        'ride_dow', 'ride_day', 'ride_hour'
    ]

    #Standardise date types
    rides['date'] = pd.to_datetime(rides['date'])
    rides.index = rides['date']

    rides['year_month'] = rides.date.dt.to_period('M')

    logger.info("Finish Loading Data")

    stop = timeit.default_timer()
    logger.info(stop - start)
    conn.close()

    #-------------------------------------------------------
    #df2 = users.groupby('commute_mode').resample("M").count()

    return users, rides, matches


#matches_day = matches.groupby(['community','ride_year','ride_month'])

#mujeres = pd.read_csv("/Users/natisangarita/TryMyRide/mujeres.csv")
#hombres = pd.read_csv("/Users/natisangarita/TryMyRide/hombres.csv")

#select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, count(*) desc
#airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])
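# Hedged sketch (hypothetical 'airports.csv'; illustration only, not in the original):
# the commented SQL above could equally be run through pandasql against the same frame.
# airports = pd.read_csv('airports.csv')
# by_type = sqldf("select iso_country, type, count(*) as n from airports "
#                 "group by iso_country, type order by iso_country, n desc",
#                 locals())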
Ejemplo n.º 48
0
#backing out tweet volume
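# For each tweet, count how many of the (at most 175) preceding tweets fall within
# the previous 0.25 units of 'elapsed' (minutes); this rolling count is the
# tweet-volume proxy appended to vol_list below.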
for i in range(len(df)):
    count = 0
    start = 0
    if i > 175:
        start = i - 175
    for j in range(start,i):
        if df.loc[i,'elapsed'] - df.loc[j,'elapsed'] < .25 and df.loc[i,'elapsed'] - df.loc[j,'elapsed'] > 0:
            count += 1
    vol_list.append(count)


#finalizing df
df['tweet_volume'] = vol_list
df['audio_volume'] = df['tweet_volume']/df.tweet_volume.max()
df = sqldf("SELECT * FROM df WHERE elapsed <= 90.1")


#Plotting
plt.style.use('ggplot')
fig, ax = plt.subplots()
#Creating a histogram of tweet volume
ax.hist(df.elapsed, bins = 240,alpha=0.4,label='Tweets Sent',color='b')
#A line plot of audio volume
ax.plot(df.elapsed,df.audio_volume*100, color='g',label='Audio Volume (%)',alpha=.95)
#Manual title
ax.set(title='Tweet Activity/Audio Volume: Bremen v. Leverkusen')
ax.set_xlabel('Minutes since start')
ax.set_ylim(0, 230)
ax.set_xlim(0, 91)
ax.legend()
Ejemplo n.º 49
0
    if pos == -1:
        pos = sqlfilter.find('from')
        if pos == -1:
            # SQL format not supported
            sqlfilter = ''
    if pos > -1:
        # found FROM, now find WHERE
        pos2 = sqlfilter.find('WHERE')
        if pos2 == -1:
            pos2 = sqlfilter.find('where')
            if pos2 == -1:
                # SQL format not supported
                sqlfilter = ''
        if pos2 > -1:
            # found FROM and WHERE, create new SQL query with netflowData as the table name
            sqlfilter = sqlfilter[:pos +
                                  4] + ' netflowData ' + sqlfilter[pos2:]
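            # e.g. (illustrative): 'SELECT * FROM flows WHERE bytes > 0'
            # becomes 'SELECT * FROM netflowData WHERE bytes > 0'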

netflowData = pd.read_csv(fileName)

# Reduce the NetFlow data frame using the SQL query
if (sqlfilter != '') and ((sqlfilter[:6] == 'SELECT') or
                          (sqlfilter[:6] == 'select')):
    print('Applying SQL filter')
    netflowData = pds.sqldf(sqlfilter, globals())

sns.set(style="white", color_codes=True)
sns_plot = sns.jointplot(x=sys.argv[2], y=sys.argv[3], data=netflowData)
sns_plot.fig.suptitle("Bivariate Distribution for " + fileName.split("\\")[-1],
                      y=1.08)
sns_plot.savefig("images/bivariate.svg")
Ejemplo n.º 50
0
cliente=pd.read_csv('cliente.csv')
producto=pd.read_csv('producto.csv')
tiempo=pd.read_csv('tiempo.csv')

"""En primera instancia vamos a descartar a las personas que ya tienen hipoteca. Pues dado que ya tienen una deuda buscaremos que la cubran para poder ofrecerles otro prestamo hipotecario"""

q1="""SELECT cliente.cliente_id, cliente.nombre_cliente, cliente.ingreso,
producto.tipo_producto_desc,producto.producto_id,producto.producto_desc,
tiempo.fecha, hechos.saldo_$ as saldo,producto.numero_cuenta
FROM hechos
JOIN cliente ON cliente.llave_cliente == hechos.llave_cliente
JOIN producto ON producto.llave_producto == hechos.llave_producto
JOIN tiempo ON tiempo.llave_tiempo == hechos.llave_tiempo
WHERE producto.tipo_producto_desc != "Mortgage"
"""
data = ps.sqldf(q1,locals())

#Convert the column to a datetime type so we can operate on it.
data['fecha'] = pd.to_datetime(data['fecha'], format='%d/%m/%Y')
data = data.sort_values(by='fecha')
#data.index = data['fecha']

"""Dado que ya sabemos las personas que tienen prestamos hipotecarios, veremos el comportamiento de los saldos de las personas que aun son candidatas para ofrecerles un prestamo"""

q2="""SELECT cliente.cliente_id, cliente.nombre_cliente, cliente.ingreso,
producto.tipo_producto_desc,producto.producto_id,producto.producto_desc,
tiempo.fecha, hechos.saldo_$ as saldo,producto.numero_cuenta
FROM hechos
JOIN cliente ON cliente.llave_cliente == hechos.llave_cliente
JOIN producto ON producto.llave_producto == hechos.llave_producto
JOIN tiempo ON tiempo.llave_tiempo == hechos.llave_tiempo
@author: elara
"""



import platform
import pandas as pd
from pandasql import sqldf 
import gensim
import logging
import multiprocessing
import itertools
import numpy as np
cores = multiprocessing.cpu_count()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
pysqldf = lambda q: sqldf(q, globals())

if platform.system() == 'Linux':
    main_path = '/mnt/c/'
if platform.system() == 'Windows':
    main_path = 'C:/'
    
test_corpus_path = main_path+ 'Elara/Documents/paper/LDA/lda_test.csv'
test_text = pd.read_csv(test_corpus_path,encoding='utf-8',engine='python',names = ['i','content'])

i = list(set(test_text['i']))[0]
texts = [doc.split() for doc in test_text.loc[test_text['i'] == i]['content']]
dictionary = gensim.corpora.Dictionary(texts)
text_train = [dictionary.doc2bow(doc) for doc in texts]
lda1 = gensim.models.ldamodel.LdaModel(corpus=text_train, num_topics=165, id2word=dictionary,
                                       distributed=False, chunksize=2000, passes=20)
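# Hedged follow-up sketch (not in the original): inspect the trained topics.
# for topic in lda1.show_topics(num_topics=5, num_words=10):
#     print(topic)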
Ejemplo n.º 52
0
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Quasi']

if model_type == "credit_batch":
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Batch']

if model_type == "debit_pin":
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Pin']

if model_type == "debit_sign":
    data = data[data['FRAUD_SCORE_TYPE_CD'] == 'Signature']

data.to_csv(out_dir + model_type + '_' + "data.csv")



pysql = lambda q: pdsql.sqldf(q, globals())

# Create overall dataframe
 
q_f_all = '''
          select   MODEL_SCORE_NR,
                   MODEL_SCORE_NR_SCALE,
                   MODEL_SCORE_NR_SCALE_INV,
                   MODEL_SCORE_NR_BIN,
                  sum(NUM_TXNS_VALID) as SUM_NUM_TXNS_VALID,
                  sum(NUM_TXNS_FRAUD) as SUM_NUM_TXNS_FRAUD,
                  sum(NUM_TXNS) as SUM_NUM_TXNS,
                  sum(SALES_VALID) as SUM_SALES_VALID,
                   sum(SALES_FRAUD) as SUM_SALES_FRAUD,
                  sum(SALES) as SUM_SALES,
                  sum(APPROVED_TRANS_VALID) as SUM_APPROVED_TRANS_VALID,
Ejemplo n.º 53
0
diabetes_df.describe()
diabetes_df.groupby("readmitted").size()

#Find null or missing
diabetes_df.isnull().sum()
diabetes_df.isna().sum()

#Exploratory analysis time 
# How many encounters by patient
q="""Select 
     patient_nbr,
     count(distinct encounter_id) as encounters
     from diabetes_df
     group by 1
     order by 2 desc"""
patient_encounters = sqldf(q)
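# Hedged sketch (pandas equivalent of the query above, for comparison only):
# patient_encounters_alt = (diabetes_df.groupby('patient_nbr')['encounter_id']
#                           .nunique()
#                           .sort_values(ascending=False)
#                           .rename('encounters')
#                           .reset_index())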
       
q="""Select AVG(encounters) as average_encounters from (Select 
     patient_nbr,
     count(distinct encounter_id) as encounters
     from diabetes_df
     group by 1
     order by 2 desc)"""
avg_patient_encounters = sqldf(q)

#Mean figures medications etc
q = """Select 
       AVG(num_procedures) as mean_num_procedures,
       AVG(num_medications) as mean_num_medications,
       AVG(num_lab_procedures) as mean_num_lab_procedures,
       AVG(time_in_hospital) as average_time_in_hospital,
Ejemplo n.º 54
0
def state_names():
    q="SELECT DISTINCT State FROM df"
    pysqldf = lambda q: sqldf(q, globals())
    a_df = pysqldf(q)
    return jsonify(list(a_df.State))
Ejemplo n.º 55
0
def Total_holdings(primary_key, secret_key):
    if (KeyFormModel):
        key = primary_key
        secret = secret_key

        # python3
        secret_bytes = bytes(secret, encoding='utf-8')
        # python2
        #secret_bytes = bytes(secret)

        # Generating a timestamp.
        timeStamp = int(round(time.time() * 1000))

        body = {"timestamp": timeStamp}

        json_body = json.dumps(body, separators=(',', ':'))

        signature = hmac.new(secret_bytes, json_body.encode(),
                             hashlib.sha256).hexdigest()

        url = "https://api.coindcx.com/exchange/v1/users/balances"

        headers = {
            'Content-Type': 'application/json',
            'X-AUTH-APIKEY': key,
            'X-AUTH-SIGNATURE': signature
        }

        response = requests.post(url, data=json_body, headers=headers)
        data = response.json()
        curr_list = []
        for i in data:
            if (i["balance"] != '0.0'):
                curr_list.append(i["currency"])

        # Enter your API Key and Secret here. If you don't have one, you can generate it from the website.

        # python3
        secret_bytes = bytes(secret, encoding='utf-8')

        # Generating a timestamp
        timeStamp = int(round(time.time() * 1000))

        body = {"timestamp": timeStamp}

        json_body = json.dumps(body, separators=(',', ':'))

        signature = hmac.new(secret_bytes, json_body.encode(),
                             hashlib.sha256).hexdigest()

        url = "https://api.coindcx.com/exchange/v1/users/balances"

        headers = {
            'Content-Type': 'application/json',
            'X-AUTH-APIKEY': key,
            'X-AUTH-SIGNATURE': signature
        }
        currency = []
        balance = []
        locked_balance = []
        response = requests.post(url, data=json_body, headers=headers)
        data = response.json()
        for curr in data:
            if (curr['balance'] != '0.0'
                    and curr['currency'] not in ['ALGO', 'INR']):
                balance.append(curr['balance'])
                currency.append(curr['currency'] + 'INR')
                locked_balance.append(curr['locked_balance'])

        PORTFOLIO = pd.DataFrame({
            'CURRENCY': currency,
            'BALANCE': balance,
            'LOCKED_BALANCE': locked_balance
        })

        url = "https://api.coindcx.com/exchange/ticker"

        response = requests.get(url)
        data = response.json()
        market = []
        last_price = []

        for currdetail in data:
            if (currdetail['market'] in currency):
                market.append(currdetail['market'])
                last_price.append(currdetail['last_price'])
        market_ticker = go.Figure(data=[
            go.Table(header=dict(values=['market', 'last_price']),
                     cells=dict(values=[market, last_price]))
        ])

        LAST_PRICE_TABLE = pd.DataFrame({
            'MARKET': market,
            'LAST_PRICE': last_price
        })

        TOTAL = ps.sqldf('''
                        SELECT A.MARKET,
                            A.LAST_PRICE,
                            B.BALANCE as ACTIVE_BALANCE,
                            B.LOCKED_BALANCE,
                            (B.BALANCE+LOCKED_BALANCE) as TOTAL_BALANCE,
                            (LAST_PRICE*(B.BALANCE+LOCKED_BALANCE)) as TOTAL_HOLDING
                            FROM LAST_PRICE_TABLE A JOIN PORTFOLIO B ON A.MARKET=B.CURRENCY'''
                         )
        PIE = ps.sqldf(''' SELECT MARKET,TOTAL_HOLDING FROM TOTAL''')
        dfpair = {'total_df': TOTAL, 'pie_df': PIE}
        return (dfpair)
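# Hedged usage sketch (placeholder credentials; as noted above, API keys are
# generated on the exchange website):
# holdings = Total_holdings('<api_key>', '<api_secret>')
# holdings['total_df'].head()
# holdings['pie_df'].head()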
Ejemplo n.º 56
0
# Add headers and interpret the last column as a date, extract year of purchase
data.columns = ['customer_id', 'purchase_amount', 'date_of_purchase']
data['date_of_purchase'] = pd.to_datetime(data.date_of_purchase)
data['days_since'] = (pd.Timestamp('2016-01-01') -
                      data['date_of_purchase']).dt.days

# Display the data after transformation
data.head()
data.describe()

# Compute key marketing indicators using SQL language

# Compute recency, frequency, and average purchase amount
customers = sqldf(
    "SELECT customer_id, MIN(days_since) AS 'recency', COUNT(*) AS 'frequency', AVG(purchase_amount) AS 'amount' FROM data GROUP BY 1",
    globals())
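# Hedged sketch (pandas equivalent of the SQL above, shown for comparison only):
# customers_alt = (data.groupby('customer_id')
#                      .agg(recency=('days_since', 'min'),
#                           frequency=('purchase_amount', 'count'),
#                           amount=('purchase_amount', 'mean'))
#                      .reset_index())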

# Explore the data
customers.head()
customers.describe()
customers.recency.hist(bins=20)
customers.frequency.hist(bins=24)
customers.amount.hist()
customers.amount.hist(bins=99)

# --- PREPARING AND TRANSFORMING DATA ----------------------

# Copy customer data into new data frame
new_data = customers.copy()
Ejemplo n.º 57
0
import geoplotlib
from pandasql import sqldf
import pandas as pd

df = pd.read_csv('data.csv')
destco = pd.read_csv('destination_latlong.csv')
orgco = pd.read_csv('origin_latlong.csv')
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], format='%Y-%m-%d')

airline = df.set_index('FL_DATE')

q = """
    SELECT A.ORIGIN_CITY_NAME, A.DEST_CITY_NAME,
           DC.lat AS dlat, DC.lon AS dlon,
           OC.lat AS olat, OC.lon AS olon
    FROM airline AS A
    JOIN destco AS DC ON A.DEST_CITY_NAME = DC.DEST_CITY_NAME
    JOIN orgco AS OC ON A.ORIGIN_CITY_NAME = OC.ORIGIN_CITY_NAME
    WHERE DEP_DELAY = 0 AND ARR_DELAY = 0;
    """
print('working..')
result = sqldf(q, locals())

geoplotlib.graph(result,
                 src_lat='olat',
                 src_lon='olon',
                 dest_lat='dlat',
                 dest_lon='dlon',
                 color='rainbow',
                 alpha=32,
                 linewidth=2)

geoplotlib.savefig('no delay')
Ejemplo n.º 58
0
def merge_and_filter_ind_disaster(days, var, min_obs_in_month,
                                  min_share_month):
    ########################################################################
    # Loading interpolated measures according to the specified number of days
    # of interpolation

    file_name = "estimated_data/interpolated_D/int_ind_disaster_days_" + str(
        days) + ".csv"
    D_df = pd.read_csv(file_name)

    # Dealing with dates:
    D_df["date"] = pd.to_datetime(D_df["date"])
    D_df["date_adj"] = D_df["date"] + pd.offsets.MonthEnd(0)
    D_df = D_df.drop("date", axis=1)

    ########################################################################
    # Limiting to companies with more than min_obs_in_month observations in a month
    # in more than min_share_month of the months in the sample (e.g. 15 observations
    # in 80% of the months from January 1996 to December 2017).
    def min_month_obs(x):
        return x[var].count() > min_obs_in_month

    D_filter_1 = D_df.groupby(["secid", "date_adj"]).filter(min_month_obs)
    D_mon_mean = D_filter_1.groupby(["secid", "date_adj"]).mean().reset_index()

    num_months = len(np.unique(D_mon_mean["date_adj"]))

    def min_sample_obs(x):
        return x[var].count() > num_months * min_share_month

    D_filter = D_mon_mean.groupby("secid").filter(min_sample_obs)

    ########################################################################
    # Loading data on monthly return and linking data:
    ret_df = pd.read_csv("estimated_data/crsp_data/crsp_monthly_returns.csv")
    ret_df["MV"] = ret_df["prc"] * ret_df["shrout"]
    oclink = pd.read_csv(
        "estimated_data/crsp_data/optionmetrics_crsp_link.csv")

    # Getting the best link for each month end of D-clamp:
    oclink = oclink[oclink.score < 6]
    oclink["sdate"] = [str(int(x)) for x in oclink["sdate"]]
    oclink["sdate"] = pd.to_datetime(oclink["sdate"], format="%Y%m%d")
    oclink["edate"] = [str(int(x)) for x in oclink["edate"]]
    oclink["edate"] = pd.to_datetime(oclink["edate"], format="%Y%m%d")

    q1 = """
    select 
        d.*,
        s1.permno as permno_1, 
        s2.permno as permno_2,
        s3.permno as permno_3,
        s4.permno as permno_4,
        s5.permno as permno_5
    from D_filter as d
    left join (
        select 
            secid, permno, sdate, edate
        from oclink
        where score = 1
    ) as s1
    on d.secid = s1.secid
    and d.date_adj >= s1.sdate
    and d.date_adj <= s1.edate
    left join (
        select 
            secid, permno, sdate, edate
        from oclink
        where score = 2
    ) as s2
    on d.secid = s2.secid
    and d.date_adj >= s2.sdate
    and d.date_adj <= s2.edate
    left join (
        select 
            secid, permno, sdate, edate
        from oclink
        where score = 3
    ) as s3
    on d.secid = s3.secid
    and d.date_adj >= s3.sdate
    and d.date_adj <= s3.edate
    left join (
        select 
            secid, permno, sdate, edate
        from oclink
        where score = 4
    ) as s4
    on d.secid = s4.secid
    and d.date_adj >= s4.sdate
    and d.date_adj <= s4.edate
    left join (
        select 
            secid, permno, sdate, edate
        from oclink
        where score = 5
    ) as s5
    on d.secid = s5.secid
    and d.date_adj >= s5.sdate
    and d.date_adj <= s5.edate
    """

    tmp = sqldf(q1, locals())

    # Filtering and providing the best match:
    q2 = """
    select 
        *,
        COALESCE(permno_1, permno_2, permno_3, permno_4, permno_5) as permno
    from tmp
    """

    disaster_ret_df = sqldf(q2, locals())
    disaster_ret_df = disaster_ret_df.drop(
        ["permno_1", "permno_2", "permno_3", "permno_4", "permno_5"], axis=1)

    # Merging with returns next month:
    disaster_ret_df = disaster_ret_df.rename({"date_adj": "date"}, axis=1)
    disaster_ret_df["date"] = pd.to_datetime(disaster_ret_df["date"])
    disaster_ret_df[
        "month_lead"] = disaster_ret_df["date"] + pd.offsets.MonthEnd(1)
    disaster_ret_df = disaster_ret_df.drop("date", axis=1)

    ret_df["date"] = pd.to_datetime(ret_df["date"])
    ret_df["date"] = ret_df["date"] + pd.offsets.MonthEnd(0)

    # Merging this month's disaster variable with next month's return on the stock
    disaster_ret_df = pd.merge(disaster_ret_df,
                               ret_df[["date", "permno", "ret"]],
                               left_on=["permno", "month_lead"],
                               right_on=["permno", "date"],
                               how="left")

    # Merging this month's disaster variable, next month's return on the stock
    # with this month's market value = |PRC|*SHROUT
    disaster_ret_df = pd.merge(disaster_ret_df,
                               ret_df[["date", "permno", "MV"]],
                               left_on=["permno", "date"],
                               right_on=["permno", "date"],
                               how="left")

    return disaster_ret_df
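# Hedged usage sketch (illustrative values only; 'var' must name a column of the
# interpolated disaster file, which is not shown here):
# df = merge_and_filter_ind_disaster(days=30, var='D_clamp',
#                                    min_obs_in_month=15, min_share_month=0.8)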
Ejemplo n.º 59
0
#print(data)
#print(data.describe())
#print(data[['Fare']].apply(np.mean))
#['Survived'],['Fare'],['Sex']
df = pd.DataFrame(data = rdata, columns=['Fare', 'Sex','Age','Survived'])
#print(df.describe())
#print(df)
#df['Age'] = df['Age'].fillna(df['Age'].mean())  #
#print(df)

df['Age'] = df['Age'].fillna(np.mean(df['Age']))

print(df)
print(np.sum(df['Age'])) 
print(np.mean(df['Age'])) 
'''

aadhaar_data = pd.read_csv(
    "C:\\Users\\berag\\SampleProject\\DataScience\\aadhaar_data.csv")
aadhaar_data.rename(columns=lambda x: x.replace(' ', '_').lower(),
                    inplace=True)

q = """ SELECT * FROM aadhaar_data LIMIT 50
    -- SQL Comment
    """
#aadhaar_solution = pandasql.sqldf(q.lower(), locals())
aadhaar_solution = pandasql.sqldf(q, globals())
b = pd.DataFrame(data=aadhaar_solution)
print(b)
Ejemplo n.º 60
0
def runBoostingRegressorWithSubstrings_and_Times(amount_of_runs, host_name,
                                                 root_name, passw_root,
                                                 database_name, query):
    total_true = 0  # the amount of correctly predicted pass/fail of the sum of both languages.
    total_prolog = 0  # the amount of correctly predicted pass/fail of prolog.
    total_haskell = 0  # the amount of correctly predicted pass/fail of haskell.
    total_avg_deviation = 0  # the sum of the average deviation of each run.
    total_avg_deviation_both = 0
    length_prediction_list = 1  # the amount of predictions made each run.

    query_result = Database_Functions.query_database_dataframe(
        host_name, root_name, passw_root, database_name,
        query)  # this is a dataframe with the needed data
    query_result, big_dict, time_dict = preprocessing_2(query_result)

    query_result = pandasql.sqldf(Queries.get_query_09_1819_df("query_result"),
                                  locals())

    grades = query_result[['user_id', 'score_prolog',
                           'score_haskell']].drop_duplicates(subset='user_id')
    # this is a dataframe with all user_id's and all scores
    grades.reset_index(
        drop=True, inplace=True
    )  # we reset the number index of the dataframe (purely cosmetics)
    possible_categories = query_result.query(
        'language==1')['category'].unique()
    # merge the per-user time features (time_dict) into the query result
    query_result = integrate_times_into_df(time_dict, query_result)
    # selecting only prolog as cat
    # possible_categories = query_result['category'].unique()

    # preprocessing(host_name, root_name, passw_root, database_name, Queries.get_query_06_)
    big_result_list = []
    for x in range(
            amount_of_runs):  # in this loop the experiment gets repeated
        print("run number " + str(x))
        verification_df = grades.sample(
            frac=0.1)  # this is a random selection of 10% of the dataframe
        train_df = grades.drop(
            verification_df.index
        )  # we drop the sample that we have selected to retain 90% to train

        training_users = set(train_df['user_id'].tolist()
                             )  # a set of all selected training-users
        verification_users = set(verification_df['user_id'].tolist())
        relevant_subset, total_freq_subset = get_relevant_subset(
            training_users, big_dict)
        trees, frequency_list_df_training = TreeConstructor.create_trees_with_subsets(
            train_df, relevant_subset, total_freq_subset)
        data_points_training_df = query_result.iloc[np.where(
            query_result.user_id.isin(training_users))]
        # we have one boosting tree per category from create_trees_with_subsets; we now predict one score per
        # user and append this to the dataframe.
        data_points_training_df = add_freq_predictions_to_df(
            trees, data_points_training_df, frequency_list_df_training)
        frequency_list_df_ver = make_frequency_list_df(big_dict,
                                                       verification_users,
                                                       total_freq_subset)

        # A dataframe of all submissions of the selected users.
        data_points_verification_df = query_result.drop(
            data_points_training_df.index)
        # we drop the selected training data to form the verification data
        data_points_verification_df = add_freq_predictions_to_df(
            trees, data_points_verification_df, frequency_list_df_ver)
        my_boosting_trees = TreeConstructor.build_big_boostingtree_with_dataframe(
            data_points_training_df, possible_categories)
        # this function returns a dictionary containing the trained decision-trees having the categories as key.

        predicted_list, actual_verification = TreeConstructor.make_boosting_predictions_with_grades_in_df(
            my_boosting_trees, data_points_verification_df,
            possible_categories)
        # this function returns two lists of per-user grade lists (floats): the predictions and the actual grades to compare
        #        for x in range(len(predicted_list)):
        #            print(predicted_list[x][0])
        #            print(actual_verification[x])
        pass_fail_result = pass_fail_boosting2(predicted_list,
                                               actual_verification)
        # here we calculate all data we need
        deviation = average_deviation_boosting2(predicted_list,
                                                actual_verification)
        total_avg_deviation += deviation[0]
        total_avg_deviation_both += deviation[1]
        total_true += sum([x[1] for x in pass_fail_result])
        total_prolog += sum([x[0][0] for x in pass_fail_result])
        total_haskell += sum([x[0][1] for x in pass_fail_result])
        #

        # we accumulate these totals because at the end we divide them by the total number of runs
        if length_prediction_list != len(pass_fail_result):
            length_prediction_list = len(pass_fail_result)
        big_result_list += [
            predicted_list[x][0].tolist() + actual_verification[x]
            for x in range(len(predicted_list))
        ]
    df = DataFrame(big_result_list,
                   columns=[
                       "Predicted Prolog", "Predicted Haskell",
                       "Actual Prolog", "Actual Haskell"
                   ])
    return [
        total_true / amount_of_runs, total_prolog / amount_of_runs,
        total_haskell / amount_of_runs, total_avg_deviation / amount_of_runs,
        length_prediction_list, total_avg_deviation_both / amount_of_runs, df
    ]
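# Hedged usage sketch (connection details and query are placeholders):
# summary = runBoostingRegressorWithSubstrings_and_Times(
#     amount_of_runs=10, host_name='localhost', root_name='root',
#     passw_root='<password>', database_name='<database>',
#     query='<SQL returning the submissions dataframe expected by preprocessing_2>')
# avg_true, avg_prolog, avg_haskell, avg_dev, n_pred, avg_dev_both, results_df = summary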