Beispiel #1
0
def main():
    """Compare the memory cost of ``range()`` and ``xrange()`` (Python 2).

    Each call is made inside a ``memory_usage()`` context; once the context
    has exited, the ``rss`` attribute of the object it yielded is printed.
    """
    count = 10 * 1000 * 1000

    # In Python 2 ``range()`` builds the complete list up front, so the
    # figure reported for this measurement is expected to be large.
    with memory_usage() as usage:
        numbers = range(count)
    print("range():  %s" % (usage.rss,))

    # ``xrange()`` instead hands back a lazy sequence object that produces
    # its values on demand, so hardly any additional memory should show up.
    with memory_usage() as usage:
        numbers = xrange(count)
    print("xrange(): %s" % (usage.rss,))
Beispiel #2
0
def main():
    """Check ``my_range()`` against ``xrange()`` and report its memory cost.

    Raises:
        Exception: if, among the pairs produced by zipping ``xrange(1000)``
            with ``my_range(1000)``, the two values differ.
    """
    # Sanity check: both sources must produce the same values pairwise.
    for expected, actual in zip(xrange(1000), my_range(1000)):
        if expected == actual:
            continue
        raise Exception("Oops: %d is not %d" % (expected, actual))

    # ``my_range()`` is expected to be lazy as well (presumably it returns an
    # iterator -- its definition is not part of this block), so creating a
    # large one should barely change the memory figure.
    count = 10 * 1000 * 1000
    with memory_usage() as usage:
        numbers = my_range(count)

    print("my_range(): %s" % (usage.rss,))
    def iterate_on_transactions(training=training, version=1):
        """Add per-user transaction features to ``training``.

        The transactions CSV is streamed three times in chunks of
        ``transactions_chunk_size`` rows, and every chunk is passed through
        ``reformat_transactions`` before use:

        1. The number of rows per ``msno`` is added to the existing
           ``total_number_of_transactions`` column.
        2. For every ``msno`` the first row after an ascending sort on
           ``transaction_date`` is merged into ``training``.
        3. ``actual_amount_paid / (payment_plan_days + 0.01)`` is summed per
           ``msno`` into ``usual_price_per_day``; ``price_per_day`` holds the
           same ratio for the row selected in pass 2.

        Args:
            training: DataFrame to enrich. Assumed to be indexed by ``msno``
                and to already contain ``total_number_of_transactions``
                (TODO confirm against the caller).
            version: ``1`` reads ``transactions.csv``; any other value reads
                ``transactions_v2.csv``. The same file is used for all passes.

        Returns:
            The enriched DataFrame, indexed by ``msno``.
        """
        if version == 1:
            print("iterate on transactions")
            path_to_csv = path_to_data + 'transactions.csv'
        else:
            print("iterate on transactions_v2")
            path_to_csv = path_to_data + 'transactions_v2.csv'

        def read_chunks():
            # A chunked reader can only be traversed once, so every pass
            # opens its own -- always on the file selected by ``version``.
            return pd.read_csv(path_to_csv, low_memory=False, iterator=True,
                               chunksize=transactions_chunk_size)

        def report_memory(training_frame, transactions_frame):
            print("memory usage of training: ")
            print(memory_usage(training_frame))
            print("memory usage of transactions: ")
            print(memory_usage(transactions_frame))

        # ---- Pass 1: count transactions per user --------------------------
        print("starting iteration...")
        for i, transactions in enumerate(read_chunks()):
            print("i=" + str(i))
            transactions = reformat_transactions(transactions)
            user_count = (transactions['msno'].value_counts()
                          .rename('current_number_of_transactions')
                          .to_frame())
            training = pd.merge(left=training, right=user_count, how='left',
                                left_index=True, right_index=True)
            # Users that do not occur in this chunk contribute zero.
            training['current_number_of_transactions'] = (
                training['current_number_of_transactions'].fillna(0).astype(int))
            training["total_number_of_transactions"] += training["current_number_of_transactions"]
            training.drop(['current_number_of_transactions'], axis=1, inplace=True)

            report_memory(training, transactions)

        print("end of iteration...")

        # ---- Pass 2: attach one transaction row per user ------------------
        training.reset_index(inplace=True)
        training_copy = training.copy()

        print("starting iteration, looking for most recent transaction...")
        for i, transactions in enumerate(read_chunks()):
            print("i=" + str(i))

            transactions = reformat_transactions(transactions)
            # NOTE(review): an ascending sort followed by first() selects the
            # *earliest* transaction_date per user, although the message above
            # says "most recent" -- confirm which one is intended.
            recent_transactions = transactions.sort_values(['transaction_date']).groupby('msno').first()
            recent_transactions.reset_index(inplace=True)
            # Both frames carry ``msno`` as a column here, so the join is on
            # that column only; combining ``on`` with ``right_index`` is
            # rejected by recent pandas versions.
            # NOTE(review): how='right' also brings in users that are absent
            # from ``training`` -- confirm this is wanted.
            temp_training = pd.merge(left=training_copy, right=recent_transactions,
                                     how='right', on=['msno'])
            training = pd.concat((training, temp_training))
            training = training.sort_values(['transaction_date']).groupby('msno').first()
            # Keep ``msno`` as a column so the next concat lines up with
            # ``temp_training``; otherwise the rows accumulated so far get a
            # NaN ``msno`` and are dropped by the following groupby.
            training.reset_index(inplace=True)

            report_memory(training, transactions)

        del training_copy

        # Pass 3 joins on the index, so restore ``msno`` as the index.
        training.set_index('msno', inplace=True)

        # ---- Pass 3: accumulate the price per day -------------------------
        training["price_per_day"] = training["actual_amount_paid"]/(training["payment_plan_days"]+0.01)
        if "usual_price_per_day" not in training.columns:
            # Start the accumulator at zero when the caller did not supply it.
            training["usual_price_per_day"] = 0.0

        print("starting iteration, looking for usual price per day...")
        for i, transactions in enumerate(read_chunks()):
            print("i=" + str(i))

            transactions = reformat_transactions(transactions)
            transactions["current_price_per_day"] = transactions["actual_amount_paid"] / (transactions["payment_plan_days"] + 0.01)
            # Only this one column is needed, so aggregate just that column.
            chunk_price = transactions.groupby("msno")[["current_price_per_day"]].sum()

            training = pd.merge(left=training, right=chunk_price, how='left', left_index=True, right_index=True)

            # Users missing from this chunk come back as NaN from the left
            # merge; treat them as zero so the running sum is not wiped out.
            training["usual_price_per_day"] += training["current_price_per_day"].fillna(0)
            training.drop(['current_price_per_day'], axis=1, inplace=True)

        return training