Ejemplo n.º 1
0
def display_usage_stats():
    """Load '201309_trip_summary.csv' and show its statistics and two plots.

    The CSV is read with pandas and handed to ``usage_stats``. ``usage_plot``
    is then called twice: once with 'subscription_type' as the variable, and
    once with 'duration' restricted by the filter 'duration < 60', using
    ``boundary=0`` and ``bin_width=5``.

    Returns:
        None.
    """
    trips = pd.read_csv('201309_trip_summary.csv')
    usage_stats(trips)

    # Trip counts per subscription type.
    usage_plot(trips, 'subscription_type')

    # Duration plot, limited to the rows matching the filter expression.
    duration_filters = ['duration < 60']
    binning = {'boundary': 0, 'bin_width': 5}
    usage_plot(trips, 'duration', duration_filters, **binning)
Ejemplo n.º 2
0
def display_new_usage_stats():
    """Load 'babs_y1_y2_summary.csv' and show its statistics and two plots.

    The CSV is read with pandas and handed to ``usage_stats``. ``usage_plot``
    is then called for 'start_month' (filter 'start_month < 12',
    ``boundary=1``) and for 'start_hour' (filter 'start_hour < 23',
    ``boundary=0``), both with ``bin_width=1``.

    NOTE(review): the strict '<' filters presumably exclude month 12 and
    hour 23 from the plots -- confirm that this is intended.

    Returns:
        None.
    """
    trips = pd.read_csv('babs_y1_y2_summary.csv')
    usage_stats(trips)

    # (variable, filter expressions, boundary) for each plot, in call order.
    plot_specs = (
        ('start_month', ['start_month < 12'], 1),
        ('start_hour', ['start_hour < 23'], 0),
    )
    for variable, filters, lower_edge in plot_specs:
        usage_plot(trips, variable, filters, boundary=lower_edge, bin_width=1)
Ejemplo n.º 3
0
def question_3(data):
    """Check that the sample data has been wrangled properly.

    Five checks are run and a diagnostic is printed for each one that fails:
    the row count, the duration statistics returned by
    ``usage_stats(data, verbose=False)``, and the value-count profile of the
    'start_month', 'start_hour' and 'weekday' columns. When every check
    passes, a single success message is printed.

    Args:
        data: Table exposing ``shape`` and column lookup by name whose columns
            provide ``value_counts()`` (presumably a pandas DataFrame).

    Returns:
        None. All findings are reported through ``print``.
    """
    n_correct = 0

    # Check that there are a correct number of lines in the dataset.
    n_rows = data.shape[0]
    if n_rows != 27345:
        print("Expected 27,345 data points, found only {:d}.".format(n_rows))
    else:
        n_correct += 1

    # Check that the durations have been converted into terms of minutes.
    data_duration_stats = usage_stats(data, verbose=False)
    expected_duration_stats = np.array([6.816667, 10.716667, 17.28333])
    if not np.allclose(data_duration_stats, expected_duration_stats):
        print("Duration statistics do not match expected units (minutes).")
        # The expected statistics expressed in seconds (no conversion done).
        stats_in_seconds = np.array([409, 643, 1037])
        # Seconds multiplied by 60 instead of divided. Deriving this from the
        # seconds values keeps the two reference arrays consistent; the
        # previous hard-coded first entry (24520) was not 409 * 60, so this
        # hint could never match.
        stats_times_sixty = stats_in_seconds * 60
        if np.allclose(data_duration_stats, stats_in_seconds):
            print("  It looks like the units are still in terms of seconds.")
        elif np.allclose(data_duration_stats, stats_times_sixty):
            print(
                "  It looks like you might have used the wrong operator in your conversion."
            )
        print("  Remember that there are 60 seconds in each minute.")
    else:
        n_correct += 1

    # Check that the timestamps have been wrangled properly.
    # Expected value counts per column, listed from most to least common.
    expected_time_vals = {
        'start_month': [25243, 2102],
        'start_hour': [
            2851, 2291, 2219, 2171, 2131, 1976, 1833, 1799, 1791, 1644, 1359,
            1269, 1071, 797, 644, 440, 394, 276, 153, 65, 55, 45, 42, 29
        ],
        'weekday': [4712, 4493, 4370, 3860, 3637, 3138, 3135]
    }

    for column, expected_counts in expected_time_vals.items():
        col_data = data[column].value_counts().values
        n_values = len(col_data)
        n_values_expected = len(expected_counts)
        if n_values != n_values_expected:
            print("Wrong number of unique values found for column: {}".format(
                column))
            print("  {:d} unique values expected; {:d} values found.".format(
                n_values_expected, n_values))
        elif not np.array_equal(col_data, expected_counts):
            expected_max = expected_counts[0]
            expected_min = expected_counts[-1]
            print("Unexpected count of values for column: {}".format(column))
            print(
                "  Most common value expected {:d} data points; {:d} trips found."
                .format(expected_max, col_data[0]))
            print(
                "  Least common value expected {:d} data points; {:d} trips found."
                .format(expected_min, col_data[-1]))
        else:
            n_correct += 1

    # Two fixed checks (row count, durations) plus one per time column.
    if n_correct == len(expected_time_vals) + 2:
        print("All counts are as expected!")
Ejemplo n.º 4
0
def question_3(data):
    """
    Check that the sample data has been wrangled properly.

    Five checks are run and a diagnostic is printed for each one that fails:
    the row count, the duration statistics returned by
    ``usage_stats(data, verbose=False)``, and the value-count profile of the
    'start_month', 'start_hour' and 'weekday' columns. When every check
    passes, a single success message is printed.

    Args:
        data: Table exposing ``shape`` and column lookup by name whose columns
            provide ``value_counts()`` (presumably a pandas DataFrame).

    Returns:
        None. All findings are reported through ``print``.
    """

    n_correct = 0

    # Check that there are a correct number of lines in the dataset.
    n_rows = data.shape[0]
    if n_rows != 27345:
        print("Eram esperados 27,345 pontos de dados, Encontrados apenas {:d}.".format(n_rows))
    else:
        n_correct += 1

    # Check that the durations have been converted into terms of minutes.
    data_duration_stats = usage_stats(data, verbose = False)
    expected_duration_stats = np.array([6.816667, 10.716667, 17.28333])
    if not np.allclose(data_duration_stats, expected_duration_stats):
        print("Os dados de duração não batem com o esperado (em minutos).")
        # The expected statistics expressed in seconds (no conversion done).
        stats_in_seconds = np.array([409, 643, 1037])
        # Seconds multiplied by 60 instead of divided. Deriving this from the
        # seconds values keeps the two reference arrays consistent; the
        # previous hard-coded first entry (24520) was not 409 * 60, so this
        # hint could never match.
        stats_times_sixty = stats_in_seconds * 60
        if np.allclose(data_duration_stats, stats_in_seconds):
            print("  Parece que as unidades ainda se encontram em segundos.")
        elif np.allclose(data_duration_stats, stats_times_sixty):
            print("  Parece que você usou o operador matemático errado para a sua conversão.")
        print("  Lembre-se que existem 60 segundos em um minuto.")
    else:
        n_correct += 1

    # Check that the timestamps have been wrangled properly.
    # Expected value counts per column, listed from most to least common.
    expected_time_vals = {'start_month': [25243, 2102],
                          'start_hour': [2851, 2291, 2219, 2171, 2131, 1976,
                                      1833, 1799, 1791, 1644, 1359, 1269,
                                      1071,  797,  644,  440,  394,  276,
                                       153,   65,   55,   45,   42,   29],
                          'weekday': [4712, 4493, 4370, 3860, 3637, 3138, 3135]}

    for column, expected_counts in expected_time_vals.items():
        col_data = data[column].value_counts().values
        n_values = len(col_data)
        n_values_expected = len(expected_counts)
        if n_values != n_values_expected:
            print("Número errado de valores únicos encontrados para a coluna : {}".format(column))
            print("  {:d} valores únicos esperados; {:d} valores encontrados.".format(n_values_expected, n_values))
        elif not np.array_equal(col_data, expected_counts):
            expected_max = expected_counts[0]
            expected_min = expected_counts[-1]
            print("Contagem de valores erradas para a coluna: {}".format(column))
            print("  Valor mais comum esperado {:d} pontos de dados; {:d} viagens encontradas.".format(expected_max, col_data[0]))
            # Message corrected: it describes the least *common* value, and
            # "encontradas" was misspelled.
            print("  Valor menos comum esperado {:d} pontos de dados; {:d} viagens encontradas.".format(expected_min, col_data[-1]))
        else:
            n_correct += 1

    # Two fixed checks (row count, durations) plus one per time column.
    if n_correct == len(expected_time_vals) + 2:
        print("Todas as contagens estão como esperadas.")
# First and third quartiles of the trip durations.
# NOTE(review): counttrip, meantrip and mediantrip, printed below, are not
# defined in this fragment -- presumably computed earlier in the notebook; confirm.
tiptrip = trip_data['duration'].quantile(.25)
tiptripover = trip_data['duration'].quantile(.75)

print('Existem {} pontos no conjunto de dados'.format(counttrip))
print('A duração média das viagens foi de {:.2f} minutos'.format(meantrip))
print('A mediana das durações das viagens foi de {:.2f} minutos'.format(
    mediantrip))
print(
    '25% das viagens foram mais curtas do que {:.2f} minutos'.format(tiptrip))
print('25% das viagens foram mais compridas do que {:.2f} minutos'.format(
    tiptripover))

# In[14]:

# Run this cell to check your processing above.
usage_stats(trip_data)

# You should see that there are more than 27,000 trips in the first month and that the mean trip duration is greater than the median trip duration (the point at which 50% of trips are shorter and 50% are longer). In fact, the mean is greater than the durations of the shortest 75% of trips. This will be interesting to look at later on.
#
# Let's start by looking at how these trips are divided by subscription type. An easy way to build intuition about the data is to plot it.
#
# Remember that pandas can plot charts directly from a DataFrame. Each kind of data/analysis may call for a different chart type that suits it best.
#
# In the cell below, draw a bar chart of trips per subscription type.

# In[15]:

# TODO: plot a bar chart showing the number of trips per subscription_type,
# remembering that when .plot is used, the chart type can be chosen with
# the kind parameter. E.g.: plot(kind='bar')
data = trip_data.groupby('subscription_type', as_index=False).count()
# Verify the dataframe by counting data points matching each of the time features.
# NOTE(review): sample_data is not defined in this fragment -- presumably
# created earlier in the notebook; confirm.
question_3(sample_data)


# > **Tip**: If you save a Jupyter notebook, the output from running code blocks will also be saved. However, the state of your workspace will be reset once a new session is started. Make sure that you run all of the necessary code blocks from your previous session to reestablish variables and functions before picking up where you last left off.
#
# ## Exploratory Data Analysis
#
# Now that you have some data saved to a file, let's look at some initial trends in the data. Some code has already been written for you in the `babs_visualizations.py` script to help summarize and visualize the data; this has been imported as the functions `usage_stats()` and `usage_plot()`. In this section we'll walk through some of the things you can do with the functions, and you'll use the functions for yourself in the last part of the project. First, run the following cell to load the data, then use the `usage_stats()` function to see the total number of trips made in the first month of operations, along with some statistics regarding how long trips took.

# In[14]:

trip_data = pd.read_csv('201309_trip_summary.csv')

usage_stats(trip_data)


# You should see that there are over 27,000 trips in the first month, and that the average trip duration is larger than the median trip duration (the point where 50% of trips are shorter, and 50% are longer). In fact, the mean is larger than the durations of the shortest 75% of trips. This will be interesting to look at later on.
#
# Let's start looking at how those trips are divided by subscription type. One easy way to build an intuition about the data is to plot it. We'll use the `usage_plot()` function for this. The second argument of the function allows us to count up the trips across a selected variable, displaying the information in a plot. The expression below will show how many customer and how many subscriber trips were made. Try it out!

# In[15]:

usage_plot(trip_data, 'subscription_type')


# It seems that subscribers made about 50% more trips than customers in the first month. Let's try a different variable now. What does the distribution of trip durations look like?

# In[16]:
Ejemplo n.º 7
0
def question_3(data):
    """
    Check that the sample data has been wrangled properly.

    Five checks are run and a diagnostic is printed for each one that fails:
    the row count, the duration statistics returned by
    ``usage_stats(data, verbose=False)``, and the value-count profile of the
    'start_month', 'start_hour' and 'weekday' columns. When every check
    passes, a single success message is printed.

    Args:
        data: Table exposing ``shape`` and column lookup by name whose columns
            provide ``value_counts()`` (presumably a pandas DataFrame).

    Returns:
        None. All findings are reported through ``print``.
    """

    n_correct = 0

    # Check that there are a correct number of lines in the dataset.
    n_rows = data.shape[0]
    if n_rows != 27345:
        print(
            "Eram esperados 27,345 pontos de dados, Encontrados apenas {:d}.".
            format(n_rows))
    else:
        n_correct += 1

    # Check that the durations have been converted into terms of minutes.
    data_duration_stats = usage_stats(data, verbose=False)
    expected_duration_stats = np.array([6.816667, 10.716667, 17.28333])
    if not np.allclose(data_duration_stats, expected_duration_stats):
        print("Os dados de duração não batem com o esperado (em minutos).")
        # The expected statistics expressed in seconds (no conversion done).
        stats_in_seconds = np.array([409, 643, 1037])
        # Seconds multiplied by 60 instead of divided. Deriving this from the
        # seconds values keeps the two reference arrays consistent; the
        # previous hard-coded first entry (24520) was not 409 * 60, so this
        # hint could never match.
        stats_times_sixty = stats_in_seconds * 60
        if np.allclose(data_duration_stats, stats_in_seconds):
            print("  Parece que as unidades ainda se encontram em segundos.")
        elif np.allclose(data_duration_stats, stats_times_sixty):
            print(
                "  Parece que você usou o operador matemático errado para a sua conversão."
            )
        print("  Lembre-se que existem 60 segundos em um minuto.")
    else:
        n_correct += 1

    # Check that the timestamps have been wrangled properly.
    # Expected value counts per column, listed from most to least common.
    expected_time_vals = {
        'start_month': [25243, 2102],
        'start_hour': [
            2851, 2291, 2219, 2171, 2131, 1976, 1833, 1799, 1791, 1644, 1359,
            1269, 1071, 797, 644, 440, 394, 276, 153, 65, 55, 45, 42, 29
        ],
        'weekday': [4712, 4493, 4370, 3860, 3637, 3138, 3135]
    }

    for column, expected_counts in expected_time_vals.items():
        col_data = data[column].value_counts().values
        n_values = len(col_data)
        n_values_expected = len(expected_counts)
        if n_values != n_values_expected:
            print(
                "Número errado de valores únicos encontrados para a coluna : {}"
                .format(column))
            print("  {:d} valores únicos esperados; {:d} valores encontrados.".
                  format(n_values_expected, n_values))
        elif not np.array_equal(col_data, expected_counts):
            expected_max = expected_counts[0]
            expected_min = expected_counts[-1]
            print(
                "Contagem de valores erradas para a coluna: {}".format(column))
            print(
                "  Valor mais comum esperado {:d} pontos de dados; {:d} viagens encontradas."
                .format(expected_max, col_data[0]))
            # Message corrected: it describes the least *common* value, and
            # "encontradas" was misspelled.
            print(
                "  Valor menos comum esperado {:d} pontos de dados; {:d} viagens encontradas."
                .format(expected_min, col_data[-1]))
        else:
            n_correct += 1

    # Two fixed checks (row count, durations) plus one per time column.
    if n_correct == len(expected_time_vals) + 2:
        print("Todas as contagens estão como esperadas.")