def make_file(input_file, output_file):
    """Load *input_file* into a pandas DataFrame (eagerly, via dask), run the
    full feature pipeline on it in place, and write the result to *output_file*.

    Note: `csv_loader`, `timer`, and `do_it_all` are defined elsewhere in the
    project and are used here unchanged.
    """
    dtypes = csv_loader.get_dtypes()
    frame = dd.read_csv(input_file, dtype=dtypes).compute()
    timer.time("started")
    do_it_all(frame)
    frame.to_csv(output_file, float_format='%.6f', index=False)
def make_file(input_file, output_file):
    """Build the featured CSV.

    Loads *input_file* with dask (repartitioned for parallelism), derives
    time-based features, materializes to pandas, then fans the remaining
    feature computations out over a thread pool and merges each returned
    (column name, Series) pair back into the DataFrame before writing
    *output_file*.

    NOTE(review): `csv_loader`, `timer`, `get_time`, `get_last_try`, and
    `submit_tasks` are defined elsewhere in the project — their exact
    semantics are assumed from their names; verify against their definitions.
    """
    dtypes = csv_loader.get_dtypes()
    # Repartition so dask can parallelize the load / early processing.
    input_df = dd.read_csv(input_file, dtype=dtypes).repartition(npartitions=32)
    timer.time("load csv")
    get_time(input_df)
    timer.time("got time")
    # Materialize to a concrete pandas DataFrame; the feature functions
    # below receive it directly, so they presumably expect pandas.
    input_df = input_df.compute()
    timer.time("got pandas dataframe")
    with futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_list = list()
        future_list.append(executor.submit(get_last_try, input_df))
        # future_list.extend(executor.submit(get_first_appear_hour, input_df))
        future_list.extend(submit_tasks(input_df, executor))
        timer.time("done executor")
        for one_future in future_list:
            # Each future resolves to a list of (column name, Series) tuples.
            list_of_tuples = one_future.result()
            for results in list_of_tuples:
                col_name, series = results
                print(col_name)
                # Attach the computed feature column to the DataFrame.
                input_df[col_name] = series
        timer.time("done fitting to df")
    input_df.to_csv(output_file, float_format='%.6f', index=False)
    timer.time("done output")
def make_file(input_file, output_file, num_rows=None):
    """Load *input_file* with pandas, run the feature pipeline in place, and
    write the result to *output_file*.

    Args:
        input_file: path to the source CSV.
        output_file: path the featured CSV is written to.
        num_rows: optional row limit for the read; None reads every row.

    Fixes vs. original: removed the dead `output_filename` assignment
    (built from OUTPUT_DIR/"train_day3_featured.csv" but never used — the
    write below honors the `output_file` argument, so the dead variable
    only misled readers about where output goes).
    """
    dtypes = csv_loader.get_dtypes()
    # pd.read_csv treats nrows=None as "read all rows", so one call covers
    # both branches of the original if/else.
    input_df = pd.read_csv(input_file, nrows=num_rows, dtype=dtypes)
    print(input_df.info())
    do_it_all(input_df)
    print(input_df.info())
    input_df.to_csv(output_file, float_format='%.6f', index=False)
import os
import sys

# Make the repository root importable before pulling in project modules.
ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

APP_ROOT = os.path.join(ROOT, "talkingdata")
INPUT_DIR = os.path.join(APP_ROOT, "input")
OUTPUT_DIR = os.path.join(APP_ROOT, "output")
TRAIN_DATA = os.path.join(INPUT_DIR, "test_old.csv")
OUTPUT_DATA = os.path.join(OUTPUT_DIR, "channel_eda_test.csv")

import pandas as pd
import numpy as np
from dask import dataframe as dd

from talkingdata.common import csv_loader

print("started")

# Load the whole CSV eagerly, using the project's declared column dtypes.
dtypes = csv_loader.get_dtypes()
df = dd.read_csv(TRAIN_DATA, dtype=dtypes).compute()

# Row count per channel ("ip" is just the column being counted), flattened
# to a two-column frame and written out.
channel_counts = df.groupby("channel")["ip"].count()
channel_counts.reset_index().to_csv(OUTPUT_DATA, index=False)