Code example #1
def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()

    for fp in files:
        delta_t = time.time() - os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats:
            if delta_t > delta_mod_t:
                continue
            print(
                'Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'
                .format(fp, delta_t, delta_mod_t))
        else:
            print('Processing: {}'.format(fp))

        c2 = {}
        ii = 0
        for line in open(fp, 'rb'):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if not line.startswith(b'{"_label_cost'):
                continue

            data = ds_parse.json_cooked(line, do_devType=True)

            if data['skipLearn']:
                continue

            # extract date from ts
            d = str(data['ts'][:13], 'utf-8')
            dev = str(data['devType'], 'utf-8')

            if d not in c2:
                c2[d] = {}
            if dev not in c2[d]:
                c2[d][dev] = [0, 0, 0]
            if 'ips' not in c2:
                c2['ips'] = {}
            if d[:10] not in c2['ips']:
                c2['ips'][d[:10]] = [0, 0, 0, 0]

            c2[d][dev][1] += 1
            if data['a'] == 1:
                c2['ips'][d[:10]][1] += 1 / data['p']
            c2['ips'][d[:10]][3] += 1 / data['p'] / data['num_a']
            if data['o'] == 1:
                c2[d][dev][0] += 1
            if data['cost'] != b'0':
                r = -float(data['cost'])
                c2[d][dev][2] += r
                if data['a'] == 1:
                    c2['ips'][d[:10]][0] += r / data['p']
                c2['ips'][d[:10]][2] += r / data['p'] / data['num_a']

        raw_stats[os.path.basename(fp)] = c2

    print('Log reading time:', time.time() - t0)
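
Reading of the aggregates built by parse_logs (an interpretation, not documented in the snippet): c2[hour][device] holds [observations, impressions, summed reward], and c2['ips'][day] holds four accumulators that look like IPS numerator/denominator pairs for a play-action-1 baseline and a uniform-random baseline. A minimal, self-contained sketch of consuming such a structure, using synthetic numbers:

def summarize_c2(c2):
    # Per-hour, per-device counters: [observations, impressions, total reward]
    for hour in sorted(k for k in c2 if k != 'ips'):
        for dev, (obs, imps, rew) in c2[hour].items():
            print('{} {}: imps={} obs={} avg_reward={:.3f}'.format(
                hour, dev, imps, obs, rew / max(imps, 1)))
    # Per-day IPS accumulators: [n_baseline1, d_baseline1, n_rand, d_rand] (assumed meaning)
    for day, (n1, d1, nr, dr) in sorted(c2['ips'].items()):
        print('{}: baseline1 SNIPS={:.3f} baselineRand SNIPS={:.3f}'.format(
            day, n1 / max(d1, 1e-9), nr / max(dr, 1e-9)))

# Synthetic example shaped like the output of parse_logs
c2 = {'2018-01-01T12': {'CB': [3, 10, 2.0]},
      'ips': {'2018-01-01': [2.5, 12.0, 1.8, 9.0]}}
summarize_c2(c2)
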
Code example #2
    def get_metadata(self, local_log_path):
        summary_path = local_log_path + '.summary'

        for x in open(local_log_path, 'rb'):
            if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x, do_decode=True)
                with open(summary_path, 'a') as f:
                    f.write(json.dumps(data) + '\n')
        os.remove(local_log_path)
        os.rename(summary_path, local_log_path)
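
get_metadata streams the raw log, keeps only complete cooked-JSON lines, writes their parsed form to a '.summary' sidecar, then replaces the original file with the sidecar. A standalone sketch of the same filter-and-rename pattern (file name and predicate are illustrative only):

import os

def filter_in_place(path, keep):
    # Write the kept lines to a sidecar file, then swap it in place of the original,
    # mirroring the remove/rename sequence used in get_metadata above.
    sidecar = path + '.summary'
    with open(path, 'rb') as src, open(sidecar, 'wb') as dst:
        for line in src:
            if keep(line):
                dst.write(line)
    os.remove(path)
    os.rename(sidecar, path)

# Usage sketch with a throwaway file
with open('example.log', 'wb') as f:
    f.write(b'{"_label_cost":-1.0}\nnoise line\n{"_label_cost":0}\n')
filter_in_place('example.log', lambda line: line.startswith(b'{"_label_cost'))
print(open('example.log', 'rb').read())
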
Code example #3
File: Visualization.py Project: oldslowfatstu/mwt-ds
def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()

    for fp in files:
        delta_t = time.time() - os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats:
            if delta_t > delta_mod_t:
                continue
            print(
                'Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'
                .format(fp, delta_t, delta_mod_t))
        else:
            print('Processing: {}'.format(fp))

        c2 = {}
        ii = 0
        for line in open(fp, 'rb'):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if not line.startswith(b'{"_label_cost'):
                continue

            ei, r, o, ts, p, a, num_a, dev = ds_parse.json_cooked(
                line, do_devType=True)

            # extract date from ts
            d = str(ts[:13], 'utf-8')
            dev = str(dev, 'utf-8')

            if d not in c2:
                c2[d] = {}
            if dev not in c2[d]:
                c2[d][dev] = [0, 0, 0]
            if 'ips' not in c2:
                c2['ips'] = {}
            if d[:10] not in c2['ips']:
                c2['ips'][d[:10]] = [0, 0, 0, 0]

            c2[d][dev][1] += 1
            if a == 1:
                c2['ips'][d[:10]][1] += 1 / p
            c2['ips'][d[:10]][3] += 1 / p / num_a
            if o == 1:
                c2[d][dev][0] += 1
            if r != b'0':
                r = float(r)
                c2[d][dev][2] -= r
                if a == 1:
                    c2['ips'][d[:10]][0] -= r / p
                c2['ips'][d[:10]][2] -= r / p / num_a

        raw_stats[os.path.basename(fp)] = c2

    print('Log reading time:', time.time() - t0)
Code example #4
File: Visualization.py Project: Microsoft/mwt-ds
def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()
    
    for fp in files:
        delta_t = time.time()-os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats:
            if delta_t > delta_mod_t:
                continue
            print('Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'.format(fp,delta_t,delta_mod_t))
        else:
            print('Processing: {}'.format(fp))
        
        c2 = {}
        ii = 0
        for line in open(fp, 'rb'):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if not line.startswith(b'{"_label_cost'):
                continue
            
            data = ds_parse.json_cooked(line, do_devType=True)
            
            if data['skipLearn']:
                continue
            
            # extract date from ts
            d = str(data['ts'][:13], 'utf-8')
            dev = str(data['devType'], 'utf-8')
            
            if d not in c2:
                c2[d] = {}
            if dev not in c2[d]:
                c2[d][dev] = [0,0,0]
            if 'ips' not in c2:
                c2['ips'] = {}
            if d[:10] not in c2['ips']:
                c2['ips'][d[:10]] = [0,0,0,0]
                
            c2[d][dev][1] += 1
            if data['a'] == 1:
                c2['ips'][d[:10]][1] += 1/data['p']
            c2['ips'][d[:10]][3] += 1/data['p']/data['num_a']
            if data['o'] == 1:
                c2[d][dev][0] += 1
            if data['cost'] != b'0':
                r = -float(data['cost'])
                c2[d][dev][2] += r
                if data['a'] == 1:
                    c2['ips'][d[:10]][0] += r/data['p']
                c2['ips'][d[:10]][2] += r/data['p']/data['num_a']

        raw_stats[os.path.basename(fp)] = c2

    print('Log reading time:', time.time()-t0)
Code example #5
File: Visualization.py Project: sidsen/mwt-ds
def parse_logs(raw_stats, files, delta_mod_t=3600):
    t0 = time.time()

    for fp in files:
        delta_t = time.time() - os.path.getmtime(fp)
        if os.path.basename(fp) in raw_stats and delta_t > delta_mod_t:
            continue
        print(
            'Processing: {} - Last modified: {:.1f} sec ago < delta_mod_t={} sec'
            .format(fp, delta_t, delta_mod_t))

        c2 = {}
        ii = 0
        for line in open(fp, encoding="utf8"):
            ii += 1
            if ii % 10000 == 0:
                print(ii)
            if 'Timestamp' not in line or '_label_cost' not in line:
                continue

            try:
                ei, r, ts, p, a, num_a, dev = ds_parse.json_cooked(
                    line, do_devType=True)

                # extract date from ts
                d = ts[:13]

                if d not in c2:
                    c2[d] = {}
                if dev not in c2[d]:
                    c2[d][dev] = [0, 0, 0]
                if 'ips' not in c2:
                    c2['ips'] = {}
                if d[:10] not in c2['ips']:
                    c2['ips'][d[:10]] = [0, 0]

                c2[d][dev][1] += 1
                c2['ips'][d[:10]][1] += 1
                if r != '0':
                    r = float(r)
                    c2[d][dev][0] += 1
                    c2[d][dev][2] -= r
                    if a == 1:
                        c2['ips'][d[:10]][0] -= r / p

            except Exception as e:
                print('error: {0}'.format(e))

        raw_stats[os.path.basename(fp)] = c2

    print('Log reading time:', time.time() - t0)
Code example #6
def update(files, dt_str=13):
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()
    c_clk = collections.Counter()
    c_imp_all = collections.Counter()
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        for i, x in enumerate(
                gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1, prefix=fp + ' - ')
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes,
                                             fp + ' - ')

            if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data is None:
                    continue

                c_imp_all.update([data['ts'][:dt_str]])
                if not data['skipLearn']:
                    c_imp.update([data['ts'][:dt_str]])
                    l.append((data, x.strip()))
                    if float(data['cost']) < 0:
                        c_clk.update([data['ts'][:dt_str]])
        if fp.endswith('.gz'):
            ds_parse.update_progress(i + 1, prefix=fp + ' - ')
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
        print()

    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x] / max(c_imp[x], 1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x, c_clk[x], c_imp[x], ctr[-1],
                                          c_imp_all[x]))
    print()
    return ts, ctr, l
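
update buckets events with three collections.Counter objects keyed by a timestamp prefix; dt_str=13 keeps 'YYYY-MM-DDTHH', i.e. hourly bins. A self-contained sketch of the same bucketing with synthetic (timestamp, cost, skipLearn) tuples:

import collections

events = [('2018-01-01T12:00:05Z', -1.0, False),
          ('2018-01-01T12:30:10Z',  0.0, False),
          ('2018-01-01T13:05:00Z', -1.0, True)]   # skipLearn: counted in totals only

dt_str = 13   # truncate timestamps to the hour
c_imp, c_clk, c_imp_all = collections.Counter(), collections.Counter(), collections.Counter()
for ts, cost, skip in events:
    key = ts[:dt_str]
    c_imp_all[key] += 1
    if not skip:
        c_imp[key] += 1
        if cost < 0:          # negative cost = positive reward = click
            c_clk[key] += 1

print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
for key in c_imp_all:
    print('{},{},{},{:.2%},{}'.format(key, c_clk[key], c_imp[key],
                                      c_clk[key] / max(c_imp[key], 1), c_imp_all[key]))
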
Code example #7
def update(files, dt_str=13):
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()
    c_clk = collections.Counter()
    c_imp_all = collections.Counter()
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        for i,x in enumerate(gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')):
            bytes_count += len(x)
            if (i+1) % 1000 == 0:
                if fp.endswith('.gz'):
                    ds_parse.update_progress(i+1,prefix=fp+' - ')
                else:
                    ds_parse.update_progress(bytes_count,tot_bytes,fp+' - ')
        
            if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data['a'] <= 0:
                    continue
                
                c_imp_all.update([data['ts'][:dt_str]])
                if not data['skipLearn']:
                    c_imp.update([data['ts'][:dt_str]])
                    l.append((data, x.strip()))
                    if float(data['cost']) < 0:
                        c_clk.update([data['ts'][:dt_str]])
        if fp.endswith('.gz'):
            ds_parse.update_progress(i+1,prefix=fp+' - ')
        else:
            ds_parse.update_progress(bytes_count,tot_bytes,fp+' - ')
        print()
                    
    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x]/max(c_imp[x],1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x,c_clk[x],c_imp[x],ctr[-1],c_imp_all[x]))
    print()
    return ts,ctr,l
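
Both update variants report progress through ds_parse.update_progress: by line count for gzip input (the decompressed size is not known up front) and by bytes read versus os.path.getsize otherwise. A minimal stand-in with roughly that call shape; the real helper's output format is an assumption, only its usage above is known:

import sys

def update_progress(done, total=None, prefix=''):
    # Single carriage-return progress line; returns its length, which some callers
    # above appear to use for clearing the line afterwards.
    if total:
        text = '{}{:.1%} ({}/{} bytes)'.format(prefix, done / total, done, total)
    else:
        text = '{}{} lines'.format(prefix, done)
    sys.stdout.write('\r' + text)
    sys.stdout.flush()
    return len(text)

# Usage sketch
total = 5000
for i in range(1, total + 1):
    if i % 1000 == 0:
        update_progress(i, total, 'demo.json - ')
print()
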
Code example #8
def print_stats(local_fp,
                azure_path,
                verbose=False,
                plot_hist=False,
                hist_bin=100):

    t = time.time()

    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    for i, x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(
                bytes_count, tot_bytes,
                'Loading Local file: {} - '.format(local_fp))
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei, r = ds_parse.local_reward(x)
                local_rew.append((ei, r))
                gt[ei].setdefault('local_rew', []).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])
    ds_parse.update_progress(tot_bytes, tot_bytes,
                             'Loading Local file: {} - '.format(local_fp))

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [
            azure_fp.path for azure_fp in scantree(azure_path)
            if azure_fp.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    verbose_output = []

    ei_miss_local = 0
    azure_data = []
    for ii, azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i, x in enumerate(
                gzip.open(azure_fp, 'rb') if azure_fp.
                endswith('.gz') else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1,
                                             prefix='File {}/{}: {} - '.format(
                                                 ii + 1, len(files), azure_fp))
                else:
                    ds_parse.update_progress(
                        bytes_count, tot_bytes,
                        'File {}/{}: {} - '.format(ii + 1, len(files),
                                                   azure_fp))

            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Local'
                            .format(len(azure_data), ei))
                else:
                    gt[ei].setdefault('azure_data', []).append((c, data['ts']))
        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(i + 1,
                                     prefix='File {}/{}: {} - '.format(
                                         ii + 1, len(files), azure_fp))
        else:
            ds_parse.update_progress(
                bytes_count, tot_bytes,
                'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i, ei in enumerate(gt):
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(i + 1, len(gt),
                                     'Evaluating differences - ')
        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append(
                        'Idx: {} - EventId: {} - Duplicate in Reward: {}'.
                        format(gt[ei]['i'], ei, gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append(
                                'Idx: {} - EventId: {} - Duplicate in Azure: {}'
                                .format(gt[ei]['i'], ei, gt[ei]['azure_data']))
                    else:
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append(
                                    'Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'
                                    .format(gt[ei]['i'], ei,
                                            gt[ei]['local_rew'][0],
                                            gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Azure'
                            .format(gt[ei]['i'], ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Reward missing from local'.format(
                        gt[ei]['i'], ei))
    ds_parse.update_progress(i + 1, len(gt), 'Evaluating differences - ')
    print()

    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}

    dup_azure = sum((x - 1) * dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x - 1) * dup_rew_counter[x] for x in dup_rew_counter)
    if verbose:
        print('-----' * 10)
        print('Missing events indexes (1-based indexing)\n{}'.format(
            no_events_idx))
        print('-----' * 10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(
            no_rewards_idx))
        print('-----' * 10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(
            err_rewards_idx))
    print('-----' * 10)
    print('Events in local_rank: {} (Duplicates: {})'.format(
        len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(
        len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(
        len(azure_data), dup_azure, dup_azure_counter))
    print('-----' * 10)
    print('Intersection local_rank/local_rew:',
          sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:',
          sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx),
                                                     len_local_rank),
              end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx),
                                                     len_local_rank),
              end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----' * 10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----' * 10)
    print('Elapsed time: ', time.time() - t)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx,
                             hist_bin,
                             label='Wrong reward',
                             color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx,
                             hist_bin,
                             label='No rank',
                             color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx,
                             hist_bin,
                             label='No local reward',
                             color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
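
The reward check in print_stats compares a local reward against an Azure cost, so agreement means local ≈ -azure; the expression abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6) is a mixed relative/absolute tolerance, equivalent in effect to math.isclose(a, -b, rel_tol=1e-7, abs_tol=1e-6). A small sketch of that check in isolation:

def rewards_match(local_rew, azure_cost, rel=1e-7, abs_tol=1e-6):
    # The Azure value is a cost, the local value a reward, so a match means a ~= -b.
    a, b = float(local_rew), float(azure_cost)
    return abs(a + b) <= max(rel * max(abs(a), abs(b)), abs_tol)

print(rewards_match('1.0', '-1.0'))       # True: reward 1.0 matches cost -1.0
print(rewards_match('1.0', '-1.5'))       # False: off by far more than the tolerance
print(rewards_match('1e-7', '0'))         # True: falls under the absolute tolerance
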
Code example #9
def compute_estimates(log_fp, cats_transformer=None):
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()
    online_cressieread = cressieread.Estimator()
    baseline1_cressieread = cressieread.Estimator()
    baselineR_cressieread = cressieread.Estimator()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.
            endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)

        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])

            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r,
                                              1 if data['a'] == 1 else 0)
            baselineR_cressieread.add_example(data['p'], r, 1 / data['num_a'])

            evts += 1

        if x.startswith(b'{"_label_ca":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked_continuous_actions(x)
            if cats_transformer is None:
                raise RuntimeError(
                    "Not all of the required arguments for running with continuous actions have been provided."
                )
            # passing logged action as predicted action to transformer
            data = cats_transformer.transform(data, data['a'])
            # passing baseline action as predicted action to transformer
            data_baseline1 = cats_transformer.transform(
                data, cats_transformer.get_baseline1_prediction())

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR.add_example(data['p'], r,
                                  1.0 / cats_transformer.continuous_range)

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR_mle.add_example(data['p'], r,
                                      1.0 / cats_transformer.continuous_range)

            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r,
                                              data_baseline1['pred_p'])
            baselineR_cressieread.add_example(
                data['p'], r, 1.0 / cats_transformer.continuous_range)

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)

    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))

    print('online_ips:', online.get_estimate('ips'))

    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baseline1 gaussian ci:', baseline1.get_interval('gaussian'))
    print('baseline1 clopper pearson ci:',
          baseline1.get_interval('clopper-pearson'))

    print('baselineR_ips:', baselineR.get_estimate('ips'))
    print('baselineR gaussian ci:', baselineR.get_interval('gaussian'))
    print('baselineR clopper pearson ci:',
          baselineR.get_interval('clopper-pearson'))

    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))

    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())

    print('online_cressieread:', online_cressieread.get_estimate())
    print('baseline1_cressieread:', baseline1_cressieread.get_estimate())
    print('baselineR_cressieread:', baselineR_cressieread.get_estimate())
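
compute_estimates feeds every estimator the same (p_log, r, p_pred) triple, where p_pred is the probability the candidate policy would have given to the logged action (its own probability for the online policy, 0/1 for the deterministic baseline1, 1/num_a for a uniform-random baseline). A minimal IPS/SNIPS estimator with that interface, shown only as a sketch of the standard formulas, not of the actual ips_snips module:

class SimpleEstimator:
    # Accumulates importance-weighted rewards with weight w = p_pred / p_log.
    def __init__(self):
        self.n = 0
        self.sum_w = 0.0
        self.sum_wr = 0.0

    def add_example(self, p_log, r, p_pred):
        w = p_pred / p_log
        self.n += 1
        self.sum_w += w
        self.sum_wr += w * r

    def get_estimate(self, kind='ips'):
        if kind == 'ips':
            return self.sum_wr / max(self.n, 1)
        if kind == 'snips':
            return self.sum_wr / self.sum_w if self.sum_w > 0 else 0.0
        raise ValueError(kind)

# Usage sketch: two logged events, evaluating a policy that always plays action 1
est = SimpleEstimator()
est.add_example(0.8, 1.0, 1.0)   # logged action was 1 (agreement), reward 1
est.add_example(0.5, 0.0, 0.0)   # logged action was not 1 (disagreement)
print(est.get_estimate('ips'), est.get_estimate('snips'))   # 0.625 1.0
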
Code example #10
File: RankRewardAnalyzer.py Project: gupchup/mwt-ds
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False):

    print('Computing statistics...')

    local_rank = []
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    for x in open(local_fp, encoding='utf-8'):
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                local_rank.append(ds_parse.local_rank(x))
            elif '/reward/' in x and 'content:' in x:
                local_rew.append(ds_parse.local_reward(x))
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])

    if os.path.isdir(azure_path):
        files = [
            azure_fp.path for azure_fp in scantree(azure_path)
            if azure_fp.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    azure_data = []
    for azure_fp in files:
        for x in open(azure_fp, 'rb'):
            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                azure_data.append([data['ei'], data['cost']])

    local_rank_set = set(local_rank)
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {str(y[0], 'utf-8'): str(y[1], 'utf-8') for y in azure_data}

    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i, x in enumerate(local_rank):
        if x in rew_dict:
            if x in azure_dict:
                if abs(1. + float(azure_dict[x]) / float(rew_dict[x])) > 1e-7:
                    if verbose:
                        print(
                            'Idx: {} - Error in reward: Local: {} Azure: {} - EventId: {}'
                            .format(i + 1, rew_dict[x], azure_dict[x], x))
                    err_rewards_idx.append(i + 1)
            else:
                no_events_idx.append(i + 1)
                if verbose:
                    print('Idx: {} - Ranking missing from Azure - EventId: {}'.
                          format(i + 1, x))
        else:
            no_rewards_idx.append(i + 1)
            if verbose:
                print(
                    'Idx: {} - Reward missing from local - EventId: {}'.format(
                        i + 1, x))

    dup_local = len(local_rew) - len(rew_dict)
    dup_azure = len(azure_data) - len(azure_dict)
    if verbose:
        print('-----' * 10)
        print('Missing events indexes (1-based indexing)\n{}'.format(
            no_events_idx))
        print('-----' * 10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(
            no_rewards_idx))
        print('-----' * 10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(
            err_rewards_idx))
        if dup_local > 0:
            print('-----' * 10)
            print('Duplicates in Local rewards')
            dup_analysis(local_rew)
        if dup_azure > 0:
            print('-----' * 10)
            print('Duplicates in Azure Storage')
            dup_analysis(azure_data)
    print('-----' * 10)
    print('Events in local_rank: {} (Duplicates: {})'.format(
        len(local_rank),
        len(local_rank) - len(local_rank_set)))
    print('Events in local_rew: {} (Duplicates: {})'.format(
        len(local_rew), dup_local))
    print('Events in azure_data: {} (Duplicates: {})'.format(
        len(azure_data), dup_azure))
    print('-----' * 10)
    print('Intersection local_rank/local_rew:',
          len(local_rank_set.intersection(rew_dict.keys())))
    print('Intersection local_rank/azure_data:',
          len(local_rank_set.intersection(azure_dict.keys())))
    print('Missing EventIds: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx),
                                                     len(local_rank)),
              end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx),
                                                     len(local_rank)),
              end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----' * 10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----' * 10)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx,
                             50,
                             label='Wrong reward',
                             color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx,
                             50,
                             label='No rank',
                             color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx,
                             50,
                             label='No local reward',
                             color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
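
This earlier, simpler variant of print_stats counts duplicates as the difference between a list's length and the size of the dict or set built from it, rather than with the Counter bookkeeping used in the fuller variant. The idea in isolation, with synthetic data:

local_rew = [('ev1', '1.0'), ('ev2', '0.5'), ('ev1', '1.0')]   # synthetic (eventId, reward) pairs
rew_dict = {ei: r for ei, r in local_rew}
dup_local = len(local_rew) - len(rew_dict)
print('Events in local_rew: {} (Duplicates: {})'.format(len(local_rew), dup_local))   # Duplicates: 1
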
Code example #11
File: dashboard_utils.py Project: tajkar/mwt-ds
def create_stats(log_fp, dashboard_file, predictions_files=None):

    t0 = time.time()

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i,x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i+1) % 1000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i+1)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            ############################### Aggregates for each bin ######################################
            #
            # 'n':   IPS of numerator
            # 'N':   total number of samples in bin from log (IPS = n/N)
            # 'd':   IPS of denominator (SNIPS = n/d)
            # 'Ne':  number of samples in bin when off-policy agrees with log policy
            # 'c':   max abs. value of numerator's items (needed for Clopper-Pearson confidence intervals)
            # 'SoS': sum of squares of numerator's items (needed for Gaussian confidence intervals)
            #
            #################################################################################################

            # binning timestamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregates for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'N':0,'d':0},
                                                     'baseline1' : {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0},
                                                     'baselineRand' : {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0}

            # update aggregates for online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['online']['N'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baseline1']['N'] += 1

            d[ts_bin]['baselineRand']['Ne'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['Ne'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']

            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                d[ts_bin]['baselineRand']['SoS'] += (r/data['p']/data['num_a'])**2
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])
                    d[ts_bin]['baseline1']['SoS'] += (r/data['p'])**2                   

            # update aggregates for additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])     # a-1: 0-index action
                d[ts_bin][name]['N'] += 1
                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['Ne'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
                        d[ts_bin][name]['SoS'] += (r*p_over_p)**2
            evts += 1
    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i+1)
    else:
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
    sys.stdout.write("\r" + " "*len_text + "\r")
    sys.stdout.flush()

    print('Read {} lines - Processed {} events'.format(i+1,evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()

    output_dashboard_data(d, dashboard_file)
    
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
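
create_stats bins events into 5-minute buckets via get_ts_5min_bin, which is not shown here. A plausible stand-in that truncates a cooked-log timestamp to its 5-minute bin; the exact behaviour of the real helper is an assumption:

def ts_5min_bin(ts):
    # Truncate an ISO timestamp such as b'2018-01-01T12:34:56.789Z' to its 5-minute bin.
    # Hypothetical stand-in for get_ts_5min_bin, which is referenced above but not shown.
    if isinstance(ts, bytes):
        ts = ts.decode('utf-8')
    minute = int(ts[14:16])
    return '{}{:02d}'.format(ts[:14], minute - minute % 5)

print(ts_5min_bin(b'2018-01-01T12:34:56.789Z'))   # -> 2018-01-01T12:30
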
Code example #12
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=100):

    t = time.time()

    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    for i,x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i+1) % 10000 == 0:
            ds_parse.update_progress(bytes_count,tot_bytes,'Loading Local file: {} - '.format(local_fp))
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei,r = ds_parse.local_reward(x)
                local_rew.append((ei,r))
                gt[ei].setdefault('local_rew',[]).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x,'status_code:','\t')])
    ds_parse.update_progress(tot_bytes,tot_bytes,'Loading Local file: {} - '.format(local_fp))

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [azure_fp.path for azure_fp in scantree(azure_path) if azure_fp.name.endswith('.json')]
    else:
        files = [azure_path]

    verbose_output = []

    ei_miss_local = 0
    azure_data = []
    for ii,azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i,x in enumerate(gzip.open(azure_fp, 'rb') if azure_fp.endswith('.gz') else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i+1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(i+1,prefix='File {}/{}: {} - '.format(ii+1,len(files),azure_fp))
                else:
                    ds_parse.update_progress(bytes_count,tot_bytes,'File {}/{}: {} - '.format(ii+1,len(files),azure_fp))

            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Local'.format(len(azure_data),ei))
                else:
                    gt[ei].setdefault('azure_data',[]).append((c, data['ts']))
        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(i+1,prefix='File {}/{}: {} - '.format(ii+1,len(files),azure_fp))
        else:
            ds_parse.update_progress(bytes_count,tot_bytes,'File {}/{}: {} - '.format(ii+1,len(files),azure_fp))
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i,ei in enumerate(gt):
        if (i+1) % 10000 == 0:
            ds_parse.update_progress(i+1,len(gt),'Evaluating differences - ')
        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append('Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(gt[ei]['i'],ei,gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append('Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(gt[ei]['i'],ei,gt[ei]['azure_data']))
                    else:
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a+b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append('Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'.format(gt[ei]['i'],ei,gt[ei]['local_rew'][0],gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Azure'.format(gt[ei]['i'],ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append('Idx: {} - EventId: {} - Reward missing from local'.format(gt[ei]['i'],ei))
    ds_parse.update_progress(i+1,len(gt),'Evaluating differences - ')
    print()

    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}

    dup_azure = sum((x-1)*dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x-1)*dup_rew_counter[x] for x in dup_rew_counter)
    if verbose:
        print('-----'*10)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print('-----'*10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print('-----'*10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))
    print('-----'*10)
    print('Events in local_rank: {} (Duplicates: {})'.format(len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(len(azure_data), dup_azure, dup_azure_counter))
    print('-----'*10)
    print('Intersection local_rank/local_rew:',sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:',sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx),len_local_rank), end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx),len_local_rank), end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----'*10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----'*10)
    print('Elapsed time: ',time.time()-t)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx, hist_bin, label='Wrong reward', color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx',a)
            if no_events_idx:
                b = plt.hist(no_events_idx, hist_bin, label='No rank', color='xkcd:blue')
                if verbose:
                    print('no_events_idx',b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx, hist_bin, label='No local reward', color='xkcd:red')
                if verbose:
                    print('no_rewards_idx',c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
Code example #13
def compute_estimates(log_fp):
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.
            endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)

        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)

    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))

    print('online_ips:', online.get_estimate('ips'))
    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baselineR_ips:', baselineR.get_estimate('ips'))

    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))

    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())
Code example #14
def download_container(app_id, log_dir, start_date=None, end_date=None, overwrite_mode=0, dry_run=False, version=2, verbose=False, create_gzip_mode=-1, delta_mod_t=3600, max_connections=4, confirm=False):
    
    t_start = time.time()
    print('-----'*10)
    print('Current UTC time: {}'.format(datetime.datetime.now(datetime.timezone.utc)))
    print('Start Date: {}'.format(start_date))
    print('End Date: {}'.format(end_date))
    print('Overwrite mode: {}'.format(overwrite_mode))
    print('dry_run: {}'.format(dry_run))
    print('version: {}'.format(version))
    print('create_gzip_mode: {}'.format(create_gzip_mode))
    
    if not dry_run:
        os.makedirs(os.path.join(log_dir, app_id), exist_ok=True)
    
    # Get Azure Storage Authentication
    config = configparser.ConfigParser()
    config.read('ds.config')
    connection_string = config['AzureStorageAuthentication'].get(app_id, config['AzureStorageAuthentication']['$Default'])
    
    print('-----'*10)
    
    if version == 1: # using C# api for uncooked logs
        output_fp = os.path.join(log_dir, app_id, app_id+'_'+start_date.strftime("%Y-%m-%d")+'_'+end_date.strftime("%Y-%m-%d")+'.json')
        print('Destination: {}'.format(output_fp))
        do_download = True
        if os.path.isfile(output_fp):
            if overwrite_mode in {0, 3, 4}:
                print('Output file already exists. Not downloading'.format(output_fp))
                do_download = False
            elif overwrite_mode == 1 and input('Output file already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) not in {'Y', 'y'}:
                do_download = False
                
        if do_download:
            if dry_run:
                print('--dry_run - Not downloading!')
            else:
                print('Downloading...', end='')
                try:
                    import requests
                    LogDownloaderURL = "https://cps-staging-exp-experimentation.azurewebsites.net/api/Log?account={ACCOUNT_NAME}&key={ACCOUNT_KEY}&start={START_DATE}&end={END_DATE}&container={CONTAINER}"
                    connection_string_dict = dict(x.split('=',1) for x in connection_string.split(';'))
                    if not connection_string_dict['AccountName'] or len(connection_string_dict['AccountKey']) != 88:
                        print("Error: Invalid Azure Storage ConnectionString.")
                        sys.exit()
                    url = LogDownloaderURL.format(ACCOUNT_NAME=connection_string_dict['AccountName'], ACCOUNT_KEY=connection_string_dict['AccountKey'].replace('+','%2b'), CONTAINER=app_id, START_DATE=start_date.strftime("%Y-%m-%d"), END_DATE=(end_date+datetime.timedelta(days=1)).strftime("%Y-%m-%d"))
                    r = requests.post(url)
                    open(output_fp, 'wb').write(r.content)
                    print(' Done!\n')
                except Exception as e:
                    print('Error: {}'.format(e))
        
    else: # using BlockBlobService python api for cooked logs
        try:
            print('Establishing Azure Storage BlockBlobService connection...')
            bbs = BlockBlobService(connection_string=connection_string)
            # List all blobs and download them one by one
            print('Getting blobs list...')
            blobs = bbs.list_blobs(app_id)
        except Exception as e:
            if e.args[0] == 'dictionary update sequence element #0 has length 1; 2 is required':
                print("Error: Invalid Azure Storage ConnectionString.")
            elif type(e.args[0]) == str and e.args[0].startswith('The specified container does not exist.'):
                print("Error: The specified container ({}) does not exist.".format(app_id))
            else:
                print("Error:\nType: {}\nArgs: {}".format(type(e).__name__, e.args))
            sys.exit()

        print('Iterating through blobs...\n')
        selected_fps = []
        for blob in blobs:
            if '/data/' not in blob.name:
                if verbose:
                    print('{} - Skip: Non-data blob\n'.format(blob.name))
                continue
            
            blob_day = datetime.datetime.strptime(blob.name.split('/data/', 1)[1].split('_', 1)[0], '%Y/%m/%d')
            if (start_date and blob_day < start_date) or (end_date and end_date < blob_day):
                if verbose:
                    print('{} - Skip: Outside of date range\n'.format(blob.name))
                continue

            try:
                bp = bbs.get_blob_properties(app_id, blob.name)

                if confirm:
                    if input("{} - Do you want to download [Y/n]? ".format(blob.name)) not in {'Y', 'y'}:
                        print()
                        continue

                fp = os.path.join(log_dir, app_id, blob.name.replace('/','_'))
                selected_fps.append(fp)
                if os.path.isfile(fp):
                    file_size = os.path.getsize(fp)
                    if overwrite_mode == 0:
                        if verbose:
                            print('{} - Skip: Output file already exists\n'.format(blob.name))
                        continue
                    elif overwrite_mode in {1, 3, 4}:
                        if file_size == bp.properties.content_length: # file size is the same, skip!
                            if verbose:
                                print('{} - Skip: Output file already exists with same size\n'.format(blob.name))
                            continue
                        print('Output file already exists: {}\nLocal size: {:.3f} MB\nAzure size: {:.3f} MB'.format(fp, file_size/(1024**2), bp.properties.content_length/(1024**2)))
                        if overwrite_mode in {3, 4} and file_size > bp.properties.content_length: # local file size is larger, skip with warning!
                            print('{} - Skip: Output file already exists with larger size\n'.format(blob.name))
                            continue
                        if overwrite_mode == 1 and input("Do you want to overwrite [Y/n]? ") not in {'Y', 'y'}:
                            print()
                            continue
                else:
                    file_size = None

                print('Processing: {} (size: {:.3f}MB - Last modified: {})'.format(blob.name, bp.properties.content_length/(1024**2), bp.properties.last_modified))
                # check if blob was modified in the last delta_mod_t sec
                if datetime.datetime.now(datetime.timezone.utc)-bp.properties.last_modified < datetime.timedelta(0, delta_mod_t):
                    if overwrite_mode < 2:
                        if input("Azure blob currently in use (modified in the last delta_mod_t={} sec). Do you want to download anyway [Y/n]? ".format(delta_mod_t)) not in {'Y', 'y'}:
                            print()
                            continue
                    elif overwrite_mode == 4:
                        print('Azure blob currently in use (modified in the last delta_mod_t={} sec). Skipping!\n'.format(delta_mod_t))
                        continue                        
                    max_connections = 1 # set max_connections to 1 to prevent crash if azure blob is modified during download

                if dry_run:
                    print('--dry_run - Not downloading!')
                else:
                    t0 = time.time()
                    if overwrite_mode in {3, 4} and file_size:
                        print('Check validity of remote file... ', end='')
                        temp_fp = fp + '.temp'
                        cmpsize = min(file_size,8*1024**2)
                        bbs.get_blob_to_path(app_id, blob.name, temp_fp, max_connections=max_connections, start_range=file_size-cmpsize, end_range=file_size-1)
                        if cmp_files(fp, temp_fp, -cmpsize):
                            print('Valid!')
                            print('Resume downloading to temp file with max_connections = {}...'.format(max_connections))
                            bbs.get_blob_to_path(app_id, blob.name, temp_fp, progress_callback=update_progress, max_connections=max_connections, start_range=os.path.getsize(fp))
                            download_time = time.time()-t0
                            download_size_MB = os.path.getsize(temp_fp)/(1024**2) # file size in MB
                            print('\nAppending to local file...')
                            with open(fp, 'ab') as f1, open(temp_fp, 'rb') as f2:
                                shutil.copyfileobj(f2, f1, length=100*1024**2)   # writing chunks of 100MB to avoid consuming memory
                            print('Appending completed. Deleting temp file...')
                            os.remove(temp_fp)
                        else:
                            os.remove(temp_fp)
                            print('Invalid! - Skip\n')
                            continue
                        print('Downloaded {:.3f} MB in {:.1f} sec. ({:.3f} MB/sec) - Total elapsed time: {:.1f} sec.\n'.format(download_size_MB, download_time, download_size_MB/download_time, time.time()-t0))
                    else:
                        print('Downloading with max_connections = {}...'.format(max_connections))
                        bbs.get_blob_to_path(app_id, blob.name, fp, progress_callback=update_progress, max_connections=max_connections)
                        download_time = time.time()-t0
                        download_size_MB = os.path.getsize(fp)/(1024**2) # file size in MB
                        print('\nDownloaded {:.3f} MB in {:.1f} sec. ({:.3f} MB/sec)\n'.format(download_size_MB, download_time, download_size_MB/download_time))
            except Exception as e:
                print('Error: {}'.format(e))

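        # create_gzip_mode semantics (inferred from the branches below; not an authoritative list):
        #   0 - concatenate the files of each model (LastConfigurationEditDate) into one .json.gz per model
        #   1 - keep only the largest file of each day, then merge those days into a single .json.gz
        #   2 - deduplicate events by EventId (keeping the lowest-cost line), sort by timestamp, write one .json.gz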
        if create_gzip_mode > -1:
            if selected_fps:
                selected_fps = [x for x in selected_fps if os.path.isfile(x)]
                if create_gzip_mode == 0:
                    models = {}
                    for fp in selected_fps:
                        models.setdefault(os.path.basename(fp).split('_data_',1)[0], []).append(fp)
                    for model in models:
                        models[model].sort(key=lambda x : list(map(int,x.split('_data_')[1].split('_')[:3])))
                        start_date = '-'.join(models[model][0].split('_data_')[1].split('_')[:3])
                        end_date = '-'.join(models[model][-1].split('_data_')[1].split('_')[:3])
                        output_fp = os.path.join(log_dir, app_id, app_id+'_'+model+'_data_'+start_date+'_'+end_date+'.json.gz')
                        print('Concat and zip files of LastConfigurationEditDate={} to: {}'.format(model, output_fp))
                        if os.path.isfile(output_fp) and input('Output file {} already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) not in {'Y', 'y'}:
                            continue
                        if dry_run:
                            print('--dry_run - Not creating gzip file!')
                        else:
                            with gzip.open(output_fp, 'wb') as f_out:
                                for fp in models[model]:
                                    print('Adding: {}'.format(fp))
                                    with open(fp, 'rb') as f_in:
                                        shutil.copyfileobj(f_in, f_out, length=100*1024**2)   # writing chunks of 100MB to avoid consuming memory
                elif create_gzip_mode == 1:
                    selected_fps.sort(key=lambda x : (list(map(int,x.split('_data_')[1].split('_')[:3])), -os.path.getsize(x), x))
                    selected_fps_merged = []
                    last_fp_date = None
                    for fp in selected_fps:
                        fp_date = datetime.datetime.strptime('_'.join(fp.split('_data_')[1].split('_')[:3]), "%Y_%m_%d")
                        if fp_date != last_fp_date:
                            selected_fps_merged.append(fp)
                            last_fp_date = fp_date

                    start_date = '-'.join(selected_fps_merged[0].split('_data_')[1].split('_')[:3])
                    end_date = '-'.join(selected_fps_merged[-1].split('_data_')[1].split('_')[:3])
                    output_fp = os.path.join(log_dir, app_id, app_id+'_merged_data_'+start_date+'_'+end_date+'.json.gz')
                    print('Merge and zip files of all LastConfigurationEditDate to: {}'.format(output_fp))
                    if not os.path.isfile(output_fp) or input('Output file {} already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) in {'Y', 'y'}:
                        if dry_run:
                            for fp in selected_fps_merged:
                                print('Adding: {}'.format(fp))
                            print('--dry_run - Not creating gzip file!')
                        else:
                            with gzip.open(output_fp, 'wb') as f_out:
                                for fp in selected_fps_merged:
                                    print('Adding: {}'.format(fp))
                                    with open(fp, 'rb') as f_in:
                                        shutil.copyfileobj(f_in, f_out, length=100*1024**2)   # writing chunks of 100MB to avoid consuming memory
                elif create_gzip_mode == 2:
                    selected_fps.sort(key=lambda x : (list(map(int,x.split('_data_')[1].split('_')[:3])), -os.path.getsize(x), x))
                    start_date = '-'.join(selected_fps[0].split('_data_')[1].split('_')[:3])
                    end_date = '-'.join(selected_fps[-1].split('_data_')[1].split('_')[:3])
                    output_fp = os.path.join(log_dir, app_id, app_id+'_deepmerged_data_'+start_date+'_'+end_date+'.json.gz')
                    print('Merge, unique, sort, and zip files of all LastConfigurationEditDate to: {}'.format(output_fp))
                    if not os.path.isfile(output_fp) or input('Output file {} already exists. Do you want to overwrite [Y/n]? '.format(output_fp)) in {'Y', 'y'}:
                        d = {}
                        for fn in selected_fps:
                            print('Parsing: {}'.format(fn), end='', flush=True)
                            if not dry_run:
                                for x in open(fn, 'rb'):
                                    if x.startswith(b'{"_label_cost') and x.strip().endswith(b'}'):     # reading only cooked lines
                                        data = ds_parse.json_cooked(x)
                                        if data['ei'] not in d or float(data['cost']) < d[data['ei']][1]: # taking line with best reward
                                            d[data['ei']] = (data['ts'], float(data['cost']), x)
                            print(' - len(d): {}'.format(len(d)))

                        print('Writing to output .gz file...')
                        if dry_run:
                            print('--dry_run - Not creating gzip file!')
                        else:
                            with gzip.open(output_fp, 'wb') as f:
                                i = 0
                                for x in sorted(d.values(), key=lambda x : x[0]):                       # events are sorted by timestamp
                                    f.write(x[2])
                                    i += 1
                                    if i % 5000 == 0:
                                        update_progress(i, len(d))
                                update_progress(i, len(d))
                                print()
                else:
                    print('Unrecognized --create_gzip_mode: {} - skipping gzip file creation.'.format(create_gzip_mode))
            else:
                print('No files downloaded - skipping gzip file creation.')
                    
    print('Total elapsed time: {:.1f} sec.\n'.format(time.time()-t_start))
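
Note: the resume path above validates the partially downloaded local file by re-downloading only the last few megabytes of the blob and comparing them against the tail of the local file via cmp_files(), which is defined elsewhere in LogDownloader.py. A minimal sketch of such a tail comparison, assuming the third argument is a negative byte offset like the -cmpsize passed above (cmp_files_sketch is a hypothetical name, not the repository's implementation), might look like this:

import os

def cmp_files_sketch(local_fp, temp_fp, tail_offset):
    # Hypothetical helper: compare the last abs(tail_offset) bytes of local_fp
    # with the re-downloaded byte range stored in temp_fp.
    with open(local_fp, 'rb') as f1, open(temp_fp, 'rb') as f2:
        f1.seek(tail_offset, os.SEEK_END)   # position at the last abs(tail_offset) bytes
        return f1.read(abs(tail_offset)) == f2.read(abs(tail_offset))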
コード例 #15
0
ファイル: dashboard_utils.py プロジェクト: slahabar/mwt-ds
def create_stats(log_fp,
                 log_type='cb',
                 d=None,
                 predictions_files=None,
                 is_summary=False,
                 report_progress=True):

    t0 = time.time()
    if d is None:
        d = {}

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp + '.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            if is_summary:
                name = pred_fp.split('/')[-1].split('.')[-2]
            else:
                name = pred_fp.split('.')[-2]  # check that policy name is encoded in file_name
            if name:
                if log_type == 'cb':
                    pred[name] = [
                        x.strip() for x in open(pred_fp) if x.strip()
                    ]
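                # CCB prediction files (assumed layout): one prediction line per slot,
                # with the slots of a single event grouped together and separated
                # from the next event by a blank line, as parsed below.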
                elif log_type == 'ccb':
                    with open(pred_fp) as f:
                        pred[name] = []
                        slot = []
                        for x in f:
                            x = x.strip()
                            if x:
                                slot.append(x)
                            else:
                                pred[name].append(slot)
                                slot = []
                print('Loaded {} predictions from {}'.format(
                    len(pred[name]), pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.
                  format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(
            len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.
              format([len(pred[name]) for name in pred]))
        sys.exit()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        if report_progress:
            # display progress
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if log_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1)
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes)

        data = None

        if log_type == 'ccb':
            if x.startswith(b'{"Timestamp"') and x.strip().endswith(b'}'):
                data = ds_parse.ccb_json_cooked(x)
                aggregates_ccb_data(data, pred, d, evts)

        elif log_type == 'cb':
            if is_summary:
                data = json.loads(x.decode("utf-8"))
            elif x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x, do_decode=True)

            # Skip wrongly formatted lines or non-activated lines
            if data is None or data['skipLearn']:
                continue

            aggregates_cb_data(data, pred, d, evts)
        evts += 1

    if report_progress:
        if log_fp.endswith('.gz'):
            len_text = ds_parse.update_progress(i + 1)
        else:
            len_text = ds_parse.update_progress(bytes_count, tot_bytes)
        sys.stdout.write("\r" + " " * len_text + "\r")
        sys.stdout.flush()

    print('Read {} lines - Processed {} events'.format(i + 1, evts))

    if any(len(pred[name]) != evts for name in pred):
        print(
            'Error: Prediction file length ({}) is different from number of events in log file ({})'
            .format([len(pred[name]) for name in pred], evts))
        sys.exit()
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time() - t0))
    return d
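
Note: a possible invocation of the create_stats() variant above, with made-up file and policy names (when predictions_files is omitted, files named <log_fp>.<policy>.pred next to the log are auto-discovered):

# Hypothetical usage sketch; the log path and the 'newpolicy' prediction file are illustrative only.
log_fp = 'logs/myapp/myapp_merged_data_2019-05-01_2019-05-07.json'
stats = create_stats(log_fp, log_type='cb',
                     predictions_files=[log_fp + '.newpolicy.pred'])
print('Aggregated {} entries'.format(len(stats)))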
コード例 #16
0
ファイル: dashboard_utils.py プロジェクト: gupchup/mwt-ds
def create_stats(log_fp, dashboard_file, predictions_files=None):

    t0 = time.time()

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    i = 0
    for x in (gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        i += 1
        if i % 5000 == 0:
            if log_fp.endswith('.gz'):
                if i % 20000 == 0:
                    print('.', end='', flush=True)
                    if i % 1000000 == 0:
                        print(' - Iter:',i)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)
            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # binning time stamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregate for ts_bin
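            # Accumulator keys used below: 'n' - (importance-weighted) reward sum and
            # 'd' - weight sum, so n/d estimates the average reward per policy;
            # off-policy entries also track 'c' - largest single weighted reward seen
            # and 'N' - number of events contributing to the estimate.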
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'d':0},
                                                     'baseline1' : {'n':0.,'d':0.,'c':0.,'N':0},
                                                     'baselineRand' : {'n':0.,'d':0.,'c':0.,'N':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'d':0.,'c':0.,'N':0}

            # online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['N'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']

            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])

            # additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])     # a-1: 0-index action
                
                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['N'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
            evts += 1
    if not log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
        sys.stdout.write("\r" + " "*len_text + "\r")
        sys.stdout.flush()

    print('Processed {} events'.format(evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()

    output_dashboard_data(d, dashboard_file)
    
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
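
Note: the 'n'/'d' accumulators above implement an inverse propensity scoring (IPS) style estimate: rewards are weighted by pred_prob/logged_prob and the estimated average reward of a policy within a time bin is n/d. A minimal sketch (not part of the repository) of reading those estimates back out of the aggregate built above:

def ips_estimates(d):
    # For every 5-minute bin, compute the n/d reward estimate of each policy
    # ('online', 'baseline1', 'baselineRand', and any prediction-based policy).
    out = {}
    for ts_bin, policies in d.items():
        out[ts_bin] = {name: (agg['n'] / agg['d']) if agg['d'] else 0.0
                       for name, agg in policies.items()}
    return out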