Example #1
def load_pytorch(path, task_per_cpu=20):
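    """Parse a log file at ``path`` into a DataFrame of events.

    Reuses a cached ``<path>.csv`` next to the log when it exists; otherwise
    the lines are parsed in parallel with a multiprocessing pool.
    """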
    # reuse the cached CSV version if it already exists
    csv_path = path.with_suffix(path.suffix + '.csv')
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        df['timestamp'] = pd.to_timedelta(df.timestamp)
        return df

    path, numLines = cm.find_file(path)

    # find optimal chunk size
    numCPU = os.cpu_count()
    chunkSize = numLines // numCPU // task_per_cpu
    if chunkSize == 0:
        chunkSize = 1

    # parse the log lines in a worker pool, updating the progress bar as results arrive
    with cm.open_file(path) as f, mp.Pool(processes=numCPU) as p,\
            tqdm(total=numLines, desc='Parsing {}'.format(path.name), unit='lines') as pb:
        def updater(log):
            pb.update()
            return log

        ilog = (updater(log) for log in
                p.imap_unordered(_process_line_paral,
                                 f, chunksize=chunkSize))
        df = pd.DataFrame(log for log in ilog if log is not None)

    df.sort_values(by=['timestamp'], inplace=True)
    return df
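
A minimal, self-contained sketch of the same parallel-parsing pattern used above (mp.Pool.imap_unordered feeding a tqdm progress bar); the toy _square worker stands in for the project's _process_line_paral:

import multiprocessing as mp

from tqdm import tqdm


def _square(x):
    return x * x


if __name__ == '__main__':
    items = list(range(1000))
    with mp.Pool() as p, tqdm(total=len(items), unit='items') as pb:
        results = []
        for r in p.imap_unordered(_square, items, chunksize=32):
            # advance the bar as each result arrives, in completion order
            pb.update()
            results.append(r)
    print(len(results), 'items processed')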
Example #2
def load_case(path):
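    """Load a case log and return one row per Model with its Queued/Started/
    Finished timestamps plus derived queuing and JCT columns."""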
    path, _ = cm.find_file(path)
    with cm.open_file(path) as f:
        df = pd.read_csv(f,
                         header=None,
                         sep=' ',
                         names=['date', 'time', 'event', 'skip', 'Model'],
                         parse_dates=[['date', 'time']])
    df = df[['date_time', 'event', 'Model']]
    df = df.rename(columns={'date_time': 'timestamp'})

    wls = df.pivot_table(values='timestamp',
                         index=['Model'],
                         columns='event',
                         aggfunc='first').reset_index()

    for col in ['Started', 'Queued', 'Finished']:
        # strip the trailing character from the raw value, then parse as datetime
        wls[col] = wls[col].str[:-1]
        wls[col] = pd.to_datetime(wls[col])
    wls['queuing'] = wls.Started - wls.Queued
    wls['JCT'] = wls.Finished - wls.Queued

    # for convenience: the numeric index after the last '.' in the Model name
    wls['No'] = pd.to_numeric(wls['Model'].str.rpartition('.')[2])

    return wls
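
A small, self-contained illustration of the pivot_table step above, using a made-up event log with one row per (Model, event) pair; the model name and timestamps are toy values:

import pandas as pd

log = pd.DataFrame({
    'Model': ['alexnet.25.0'] * 3,
    'event': ['Queued', 'Started', 'Finished'],
    'timestamp': pd.to_datetime(['2018-01-01 10:00:00',
                                 '2018-01-01 10:00:05',
                                 '2018-01-01 10:00:30']),
})

# one row per Model, one column per event, keeping the first timestamp seen
wls = log.pivot_table(values='timestamp', index=['Model'],
                      columns='event', aggfunc='first').reset_index()
wls['queuing'] = wls.Started - wls.Queued    # time spent waiting in the queue
wls['JCT'] = wls.Finished - wls.Queued       # job completion time
print(wls)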
Example #3
def parse_iterations(path):
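    """Extract per-iteration records matching ``ptn_iter`` from a training log
    and return them as a DataFrame with Speed, Step, Duration and CumSpeed columns."""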
    path, _ = cm.find_file(path)
    iterations = []
    with cm.open_file(path) as f:
        for line in f:
            line = line.rstrip('\n')

            m = ptn_iter.match(line)
            if m:
                iterations.append(m.groupdict())
    assert len(iterations) > 0
    # append a fake zero-speed record 1us after the last iteration, and later
    # prepend one 1us before the first, so the speed curve starts and ends at zero
    fake = {}
    fake.update(iterations[-1])
    fake['Speed'] = 0
    fake['timestamp'] = (pd.to_datetime(fake['timestamp']) + pd.Timedelta(1, 'us')).strftime('%Y-%m-%d %H:%M:%S.%f')
    iterations.append(fake)

    fake = {}
    fake.update(iterations[0])
    fake['Speed'] = 0
    fake['timestamp'] = (pd.to_datetime(fake['timestamp']) - pd.Timedelta(1, 'us')).strftime('%Y-%m-%d %H:%M:%S.%f')
    iterations[:0] = [fake]

    df = pd.DataFrame(iterations)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['Speed'] = pd.to_numeric(df['Speed'])
    df['Step'] = pd.to_numeric(df.Step)
    df['Duration'] = pd.to_numeric(df.Duration)

    df = df.sort_values('timestamp')
    # calculate a cumulative speed
    # batch size is taken from the file name: the part after the first '_'
    # and before the first '.'
    batch_size = int(path.name.partition('.')[0].partition('_')[-1])

    cumspeed = []
    # wall-clock origin: the first recorded timestamp minus that step's duration
    start = df['timestamp'].iloc[0] - pd.Timedelta(df.Duration.iloc[0], 's')
    for idx, row in df.iterrows():
        # total images processed so far, divided by elapsed seconds
        images = batch_size * (idx + 1)
        dur = (row['timestamp'] - start) / pd.Timedelta(1, 's')
        cumspeed.append(images / dur)
    df['CumSpeed'] = cumspeed
    return df
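
The cumulative-speed loop above can also be written without iterrows; a sketch that assumes, as the function does, that the rows are already sorted by timestamp and indexed 0..n-1:

import pandas as pd

def cumulative_speed(df, batch_size):
    # elapsed seconds since the start of the first recorded iteration
    start = df['timestamp'].iloc[0] - pd.Timedelta(df['Duration'].iloc[0], 's')
    elapsed = (df['timestamp'] - start) / pd.Timedelta(1, 's')
    # total images processed after each step, divided by elapsed wall time
    images = batch_size * pd.Series(range(1, len(df) + 1), index=df.index)
    return images / elapsed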