import os
import multiprocessing as mp

import pandas as pd
from tqdm import tqdm

# Assumed context: cm (a repo-local helper providing find_file/open_file),
# ptn_iter (a compiled regex matching iteration lines) and _process_line_paral
# (the per-line parse worker) are defined elsewhere in this file.


def load_pytorch(path, task_per_cpu=20):
    # check if we already have a cached parse result
    cache = path.with_suffix(path.suffix + '.csv')
    if cache.exists():
        df = pd.read_csv(cache)
        df['timestamp'] = pd.to_timedelta(df.timestamp)
        return df

    path, numLines = cm.find_file(path)

    # find a chunk size that gives each CPU roughly task_per_cpu chunks
    numCPU = os.cpu_count()
    chunkSize = numLines // numCPU // task_per_cpu
    if chunkSize == 0:
        chunkSize = 1

    # parse lines in parallel, updating the progress bar as results arrive
    with cm.open_file(path) as f, mp.Pool(processes=numCPU) as p, \
            tqdm(total=numLines, desc='Parsing {}'.format(path.name),
                 unit='lines') as pb:
        def updater(log):
            pb.update()
            return log

        ilog = (updater(log)
                for log in p.imap_unordered(_process_line_paral, f,
                                            chunksize=chunkSize))
        df = pd.DataFrame(log for log in ilog if log is not None)

    # imap_unordered yields results in completion order, so restore time order
    df.sort_values(by=['timestamp'], inplace=True)
    return df
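
# Illustrative sketch, not used by the pipeline: the same Pool.imap_unordered
# + tqdm pattern as load_pytorch, run on synthetic lines instead of a log
# file. The worker must be a module-level function so it can be pickled;
# _demo_parse_line and _demo_parallel_parse are hypothetical names. On
# platforms that spawn workers (Windows, macOS), call this from under an
# ``if __name__ == '__main__'`` guard.
def _demo_parse_line(line):
    # Return a record dict for recognized lines and None otherwise,
    # mirroring the dict-or-None contract of _process_line_paral.
    if not line.startswith('ts='):
        return None
    return {'timestamp': line.split('=', 1)[1]}


def _demo_parallel_parse():
    lines = ['ts=2018-01-01 00:00:0{}'.format(i) for i in range(8)] + ['noise']
    numCPU = os.cpu_count()
    chunkSize = max(len(lines) // numCPU // 20, 1)
    with mp.Pool(processes=numCPU) as p, \
            tqdm(total=len(lines), desc='Parsing demo', unit='lines') as pb:
        logs = []
        for log in p.imap_unordered(_demo_parse_line, lines,
                                    chunksize=chunkSize):
            pb.update()
            if log is not None:
                logs.append(log)
    df = pd.DataFrame(logs)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # results arrive in completion order, so sort explicitly
    return df.sort_values(by=['timestamp'])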
def load_case(path):
    path, _ = cm.find_file(path)
    with cm.open_file(path) as f:
        df = pd.read_csv(f, header=None, sep=' ',
                         names=['date', 'time', 'event', 'skip', 'Model'],
                         parse_dates=[['date', 'time']])
    df = df[['date_time', 'event', 'Model']]
    df['timestamp'] = df['date_time']
    df = df.drop('date_time', axis=1)

    # fold per-model Queued/Started/Finished events into one row per model
    wls = df.pivot_table(values='timestamp', index=['Model'],
                         columns='event', aggfunc='first').reset_index()
    for col in ['Started', 'Queued', 'Finished']:
        # strip the trailing character from the raw value before parsing
        wls[col] = wls[col].str[:-1]
        wls[col] = pd.to_datetime(wls[col])
    wls['queuing'] = wls.Started - wls.Queued
    wls['JCT'] = wls.Finished - wls.Queued

    # for convenience: the numeric suffix after the last '.' in the model name
    wls['No'] = pd.to_numeric(wls['Model'].str.rpartition('.')[2])
    return wls
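
# Illustrative sketch, not part of the parsing pipeline: the same pivot_table
# trick as load_case, on a synthetic event table, showing how per-model
# Queued/Started/Finished rows fold into one row per model. The function name
# and the sample model name are hypothetical.
def _demo_events_to_wls():
    df = pd.DataFrame({
        'timestamp': pd.to_datetime(['2018-01-01 00:00:00',
                                     '2018-01-01 00:00:05',
                                     '2018-01-01 00:00:30']),
        'event': ['Queued', 'Started', 'Finished'],
        'Model': ['alexnet.1'] * 3,
    })
    wls = df.pivot_table(values='timestamp', index=['Model'],
                         columns='event', aggfunc='first').reset_index()
    wls['queuing'] = wls.Started - wls.Queued  # time spent waiting in queue
    wls['JCT'] = wls.Finished - wls.Queued     # job completion time
    return wls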
def parse_iterations(path):
    path, _ = cm.find_file(path)
    iterations = []
    with cm.open_file(path) as f:
        for line in f:
            line = line.rstrip('\n')
            m = ptn_iter.match(line)
            if m:
                iterations.append(m.groupdict())
    assert len(iterations) > 0

    # bracket the real records with zero-speed sentinels, placed 1us after
    # the last timestamp and 1us before the first
    fake = {}
    fake.update(iterations[-1])
    fake['Speed'] = 0
    fake['timestamp'] = (pd.to_datetime(fake['timestamp'])
                         + pd.Timedelta(1, 'us')).strftime('%Y-%m-%d %H:%M:%S.%f')
    iterations.append(fake)
    fake = {}
    fake.update(iterations[0])
    fake['Speed'] = 0
    fake['timestamp'] = (pd.to_datetime(fake['timestamp'])
                         - pd.Timedelta(1, 'us')).strftime('%Y-%m-%d %H:%M:%S.%f')
    iterations[:0] = [fake]

    df = pd.DataFrame(iterations)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['Speed'] = pd.to_numeric(df['Speed'])
    df['Step'] = pd.to_numeric(df.Step)
    df['Duration'] = pd.to_numeric(df.Duration)
    # reset the index so idx below counts rows in time order
    df = df.sort_values('timestamp').reset_index(drop=True)

    # calculate a cumulative speed; the batch size is encoded in the file
    # name, after the first '_' in the stem
    batch_size = int(path.name.partition('.')[0].partition('_')[-1])
    cumspeed = []
    start = df['timestamp'].iloc[0] - pd.Timedelta(df.Duration.iloc[0], 's')
    for idx, row in df.iterrows():
        images = batch_size * (idx + 1)
        dur = (row['timestamp'] - start) / pd.Timedelta(1, 's')
        cumspeed.append(images / dur)
    df['CumSpeed'] = cumspeed
    return df
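
# Illustrative sketch: the cumulative-speed computation from parse_iterations
# on synthetic per-step records, with a hypothetical batch size of 32.
# CumSpeed at step k is the total images processed so far divided by the wall
# time elapsed since the reconstructed start of step 1.
def _demo_cumspeed(batch_size=32):
    df = pd.DataFrame({
        'timestamp': pd.to_datetime(['2018-01-01 00:00:01',
                                     '2018-01-01 00:00:02',
                                     '2018-01-01 00:00:03']),
        'Duration': [1.0, 1.0, 1.0],
    })
    # recover the start time by backing off the first step's duration
    start = df['timestamp'].iloc[0] - pd.Timedelta(df.Duration.iloc[0], 's')
    cumspeed = []
    for idx, row in df.iterrows():
        images = batch_size * (idx + 1)
        dur = (row['timestamp'] - start) / pd.Timedelta(1, 's')
        cumspeed.append(images / dur)
    df['CumSpeed'] = cumspeed  # here a constant 32 images/s
    return df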