Example 1
def update(files, dt_str=13):
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()
    c_clk = collections.Counter()
    c_imp_all = collections.Counter()
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        for i, x in enumerate(
                gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1, prefix=fp + ' - ')
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes,
                                             fp + ' - ')

            if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data is None:
                    continue

                c_imp_all.update([data['ts'][:dt_str]])
                if not data['skipLearn']:
                    c_imp.update([data['ts'][:dt_str]])
                    l.append((data, x.strip()))
                    if float(data['cost']) < 0:
                        c_clk.update([data['ts'][:dt_str]])
        if fp.endswith('.gz'):
            ds_parse.update_progress(i + 1, prefix=fp + ' - ')
        else:
            ds_parse.update_progress(bytes_count, tot_bytes, fp + ' - ')
        print()

    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x] / max(c_imp[x], 1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x, c_clk[x], c_imp[x], ctr[-1],
                                          c_imp_all[x]))
    print()
    return ts, ctr, l
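Example 1 tallies impressions and clicks per timestamp prefix with collections.Counter (dt_str=13 keeps 'YYYY-MM-DDTHH', i.e. hourly bins) and reports CTR as clicks over activated impressions. Below is a minimal standalone sketch of that counting pattern; the records are hypothetical stand-ins for what ds_parse.json_cooked extracts from each dsjson line.

# Minimal sketch of the Counter-based CTR-by-hour aggregation used above.
# The records are made up; the real code derives ts/cost/skipLearn per line.
import collections

records = [
    {'ts': '2023-05-01T10:00:12Z', 'cost': '-1', 'skipLearn': False},
    {'ts': '2023-05-01T10:30:55Z', 'cost': '0',  'skipLearn': False},
    {'ts': '2023-05-01T11:02:03Z', 'cost': '0',  'skipLearn': True},
]

dt_str = 13                      # keep 'YYYY-MM-DDTHH' -> hourly bins
c_imp, c_clk, c_imp_all = collections.Counter(), collections.Counter(), collections.Counter()
for data in records:
    key = data['ts'][:dt_str]
    c_imp_all.update([key])
    if not data['skipLearn']:
        c_imp.update([key])
        if float(data['cost']) < 0:   # negative cost == reward == click
            c_clk.update([key])

for key in c_imp_all:
    print(key, c_clk[key], c_imp[key],
          '{:.2%}'.format(c_clk[key] / max(c_imp[key], 1)), c_imp_all[key])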
Example 2
def update(files, dt_str=13):
    fp_list = ds_parse.input_files_to_fp_list(files)
    l = []
    c_imp = collections.Counter()
    c_clk = collections.Counter()
    c_imp_all = collections.Counter()
    for fp in fp_list:
        bytes_count = 0
        tot_bytes = os.path.getsize(fp)
        for i,x in enumerate(gzip.open(fp, 'rb') if fp.endswith('.gz') else open(fp, 'rb')):
            bytes_count += len(x)
            if (i+1) % 1000 == 0:
                if fp.endswith('.gz'):
                    ds_parse.update_progress(i+1,prefix=fp+' - ')
                else:
                    ds_parse.update_progress(bytes_count,tot_bytes,fp+' - ')
        
            if x.startswith(b'{"_label') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x)
                if data['a'] <= 0:
                    continue
                
                c_imp_all.update([data['ts'][:dt_str]])
                if not data['skipLearn']:
                    c_imp.update([data['ts'][:dt_str]])
                    l.append((data, x.strip()))
                    if float(data['cost']) < 0:
                        c_clk.update([data['ts'][:dt_str]])
        if fp.endswith('.gz'):
            ds_parse.update_progress(i+1,prefix=fp+' - ')
        else:
            ds_parse.update_progress(bytes_count,tot_bytes,fp+' - ')
        print()
                    
    ctr = []
    ts = []
    print('Timestamp (UTC),Clicks,Activated Imp.,CTR,Total Imp.')
    for x in c_imp_all:
        ctr.append(c_clk[x]/max(c_imp[x],1))
        ts.append(x)
        print('{},{},{},{:.2%},{}'.format(x,c_clk[x],c_imp[x],ctr[-1],c_imp_all[x]))
    print()
    return ts,ctr,l
Example 3
def print_stats(local_fp,
                azure_path,
                verbose=False,
                plot_hist=False,
                hist_bin=100):

    t = time.time()

    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    for i, x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(
                bytes_count, tot_bytes,
                'Loading Local file: {} - '.format(local_fp))
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei, r = ds_parse.local_reward(x)
                local_rew.append((ei, r))
                gt[ei].setdefault('local_rew', []).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x, 'status_code:', '\t')])
    ds_parse.update_progress(tot_bytes, tot_bytes,
                             'Loading Local file: {} - '.format(local_fp))

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [
            azure_fp.path for azure_fp in scantree(azure_path)
            if azure_fp.name.endswith('.json')
        ]
    else:
        files = [azure_path]

    verbose_output = []

    ei_miss_local = 0
    azure_data = []
    for ii, azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i, x in enumerate(
                gzip.open(azure_fp, 'rb') if azure_fp.endswith('.gz')
                else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i + 1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1,
                                             prefix='File {}/{}: {} - '.format(
                                                 ii + 1, len(files), azure_fp))
                else:
                    ds_parse.update_progress(
                        bytes_count, tot_bytes,
                        'File {}/{}: {} - '.format(ii + 1, len(files),
                                                   azure_fp))

            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Local'
                            .format(len(azure_data), ei))
                else:
                    gt[ei].setdefault('azure_data', []).append((c, data['ts']))
        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(i + 1,
                                     prefix='File {}/{}: {} - '.format(
                                         ii + 1, len(files), azure_fp))
        else:
            ds_parse.update_progress(
                bytes_count, tot_bytes,
                'File {}/{}: {} - '.format(ii + 1, len(files), azure_fp))
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i, ei in enumerate(gt):
        if (i + 1) % 10000 == 0:
            ds_parse.update_progress(i + 1, len(gt),
                                     'Evaluating differences - ')
        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append(
                        'Idx: {} - EventId: {} - Duplicate in Reward: {}'.
                        format(gt[ei]['i'], ei, gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append(
                                'Idx: {} - EventId: {} - Duplicate in Azure: {}'
                                .format(gt[ei]['i'], ei, gt[ei]['azure_data']))
                    else:
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append(
                                    'Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'
                                    .format(gt[ei]['i'], ei,
                                            gt[ei]['local_rew'][0],
                                            gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append(
                            'Idx: {} - EventId: {} - Ranking missing from Azure'
                            .format(gt[ei]['i'], ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append(
                    'Idx: {} - EventId: {} - Reward missing from local'.format(
                        gt[ei]['i'], ei))
    ds_parse.update_progress(i + 1, len(gt), 'Evaluating differences - ')
    print()

    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}

    dup_azure = sum((x - 1) * dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x - 1) * dup_rew_counter[x] for x in dup_rew_counter)
    if verbose:
        print('-----' * 10)
        print('Missing events indexes (1-based indexing)\n{}'.format(
            no_events_idx))
        print('-----' * 10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(
            no_rewards_idx))
        print('-----' * 10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(
            err_rewards_idx))
    print('-----' * 10)
    print('Events in local_rank: {} (Duplicates: {})'.format(
        len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(
        len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(
        len(azure_data), dup_azure, dup_azure_counter))
    print('-----' * 10)
    print('Intersection local_rank/local_rew:',
          sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:',
          sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx),
                                                     len_local_rank),
              end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx),
                                                     len_local_rank),
              end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----' * 10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----' * 10)
    print('Elapsed time: ', time.time() - t)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx,
                             hist_bin,
                             label='Wrong reward',
                             color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx', a)
            if no_events_idx:
                b = plt.hist(no_events_idx,
                             hist_bin,
                             label='No rank',
                             color='xkcd:blue')
                if verbose:
                    print('no_events_idx', b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx,
                             hist_bin,
                             label='No local reward',
                             color='xkcd:red')
                if verbose:
                    print('no_rewards_idx', c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
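The reward comparison in print_stats treats a local reward and an Azure cost as matching when they are negatives of each other within a mixed relative/absolute tolerance: abs(a + b) > max(1e-7 * max(abs(a), abs(b)), 1e-6) flags a mismatch. A small standalone sketch of that check, for clarity:

# Sketch of the mismatch test used above: local reward `a` should equal the
# negated Azure cost `b`, up to relative (1e-7) and absolute (1e-6) tolerance.
def reward_mismatch(local_reward, azure_cost, rel_tol=1e-7, abs_tol=1e-6):
    a = float(local_reward)
    b = float(azure_cost)
    return abs(a + b) > max(rel_tol * max(abs(a), abs(b)), abs_tol)

print(reward_mismatch('1.0', '-1.0'))   # False: reward 1.0 matches cost -1.0
print(reward_mismatch('1.0', '-0.5'))   # True: values disagree
print(reward_mismatch('0.0', '1e-8'))   # False: within absolute tolerance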
Example 4
def compute_estimates(log_fp, cats_transformer=None):
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()
    online_cressieread = cressieread.Estimator()
    baseline1_cressieread = cressieread.Estimator()
    baselineR_cressieread = cressieread.Estimator()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz')
            else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)

        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])

            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r,
                                              1 if data['a'] == 1 else 0)
            baselineR_cressieread.add_example(data['p'], r, 1 / data['num_a'])

            evts += 1

        if x.startswith(b'{"_label_ca":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked_continuous_actions(x)
            if cats_transformer is None:
                raise RuntimeError(
                    "Not all of the required arguments for running with continuous actions have been provided."
                )
            # passing logged action as predicted action to transformer
            data = cats_transformer.transform(data, data['a'])
            # passing baseline action as predicted action to transformer
            data_baseline1 = cats_transformer.transform(
                data, cats_transformer.get_baseline1_prediction())

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR.add_example(data['p'], r,
                                  1.0 / cats_transformer.continuous_range)

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, data_baseline1['pred_p'])
            baselineR_mle.add_example(data['p'], r,
                                      1.0 / cats_transformer.continuous_range)

            online_cressieread.add_example(data['p'], r, data['p'])
            baseline1_cressieread.add_example(data['p'], r,
                                              data_baseline1['pred_p'])
            baselineR_cressieread.add_example(
                data['p'], r, 1.0 / cats_transformer.continuous_range)

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)

    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))

    print('online_ips:', online.get_estimate('ips'))

    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baseline1 gaussian ci:', baseline1.get_interval('gaussian'))
    print('baseline1 clopper pearson ci:',
          baseline1.get_interval('clopper-pearson'))

    print('baselineR_ips:', baselineR.get_estimate('ips'))
    print('baselineR gaussian ci:', baselineR.get_interval('gaussian'))
    print('baselineR clopper pearson ci:',
          baselineR.get_interval('clopper-pearson'))

    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))

    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())

    print('online_cressieread:', online_cressieread.get_estimate())
    print('baseline1_cressieread:', baseline1_cressieread.get_estimate())
    print('baselineR_cressieread:', baselineR_cressieread.get_estimate())
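compute_estimates only relies on the estimators exposing add_example(p_log, r, p_pred) and get_estimate('ips'/'snips'). As a rough illustration of what those estimates mean, here is a minimal IPS/SNIPS accumulator with the same interface; this is a sketch, not the actual ips_snips.Estimator implementation.

# Illustrative IPS/SNIPS accumulator matching the add_example(p_log, r, p_pred)
# interface assumed above. Not the real ips_snips.Estimator.
class SimpleEstimator:
    def __init__(self):
        self.n = 0.0      # sum of importance-weighted rewards
        self.d = 0.0      # sum of importance weights (SNIPS denominator)
        self.N = 0        # number of logged examples (IPS denominator)

    def add_example(self, p_log, r, p_pred):
        w = p_pred / p_log
        self.n += r * w
        self.d += w
        self.N += 1

    def get_estimate(self, kind='ips'):
        if kind == 'ips':
            return self.n / max(self.N, 1)
        return self.n / self.d if self.d > 0 else 0.0   # 'snips'

est = SimpleEstimator()
est.add_example(0.5, 1.0, 1.0)   # logged prob 0.5, reward 1, target-policy prob 1
est.add_example(0.5, 0.0, 0.0)
print(est.get_estimate('ips'), est.get_estimate('snips'))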
Example 5
def create_stats(log_fp, dashboard_file, predictions_files=None):

    t0 = time.time()

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i,x in enumerate(gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i+1) % 1000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i+1)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            ############################### Aggregates for each bin ######################################
            #
            # 'n':   IPS of numerator
            # 'N':   total number of samples in bin from log (IPS = n/N)
            # 'd':   IPS of denominator (SNIPS = n/d)
            # 'Ne':  number of samples in bin when off-policy agrees with log policy
            # 'c':   max abs. value of numerator's items (needed for Clopper-Pearson confidence intervals)
            # 'SoS': sum of squares of numerator's items (needed for Gaussian confidence intervals)
            #
            #################################################################################################

            # binning timestamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregates for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'N':0,'d':0},
                                                     'baseline1' : {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0},
                                                     'baselineRand' : {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'N':0,'d':0.,'Ne':0,'c':0.,'SoS':0}

            # update aggregates for online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['online']['N'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baseline1']['N'] += 1

            d[ts_bin]['baselineRand']['Ne'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['Ne'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']

            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                d[ts_bin]['baselineRand']['SoS'] += (r/data['p']/data['num_a'])**2
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])
                    d[ts_bin]['baseline1']['SoS'] += (r/data['p'])**2                   

            # update aggregates for additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])     # a-1: 0-index action
                d[ts_bin][name]['N'] += 1
                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['Ne'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
                        d[ts_bin][name]['SoS'] += (r*p_over_p)**2
            evts += 1
    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i+1)
    else:
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
    sys.stdout.write("\r" + " "*len_text + "\r")
    sys.stdout.flush()

    print('Read {} lines - Processed {} events'.format(i+1,evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()

    output_dashboard_data(d, dashboard_file)
    
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
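The comment block inside create_stats defines, per 5-minute bin and per policy, the aggregates 'n', 'N', 'd', 'Ne', 'c', and 'SoS', with IPS = n/N and SNIPS = n/d. A short worked sketch of turning one such bin into estimates (the numbers are made up; the real aggregates are emitted by output_dashboard_data):

# Worked sketch: converting one bin's aggregates into IPS/SNIPS estimates,
# following the field definitions in the comment block above. Values are made up.
bin_stats = {'n': 3.2, 'N': 100, 'd': 95.5, 'Ne': 40, 'c': 1.8, 'SoS': 7.1}

ips = bin_stats['n'] / bin_stats['N']       # importance-weighted average reward
snips = bin_stats['n'] / bin_stats['d']     # self-normalized variant
print('IPS:   {:.4f}'.format(ips))
print('SNIPS: {:.4f}'.format(snips))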
Example 6
def print_stats(local_fp, azure_path, verbose=False, plot_hist=False, hist_bin=100):

    t = time.time()

    gt = {}
    len_local_rank = 0
    dup_rank = 0
    local_rew = []
    lines_errs = 0
    err_codes = collections.Counter()
    bytes_count = 0
    tot_bytes = os.path.getsize(local_fp)
    for i,x in enumerate(open(local_fp, encoding='utf-8')):
        bytes_count += len(x)
        if (i+1) % 10000 == 0:
            ds_parse.update_progress(bytes_count,tot_bytes,'Loading Local file: {} - '.format(local_fp))
        if 'status_code:200' in x:
            if '/rank/' in x and '"eventId":"' in x:
                ei = ds_parse.local_rank(x)
                len_local_rank += 1
                if ei in gt:
                    dup_rank += 1
                else:
                    gt[ei] = {'i': len_local_rank}
            elif '/reward/' in x and 'content:' in x:
                ei,r = ds_parse.local_reward(x)
                local_rew.append((ei,r))
                gt[ei].setdefault('local_rew',[]).append(r)
            else:
                lines_errs += 1
        else:
            err_codes.update([ds_parse.extract_field(x,'status_code:','\t')])
    ds_parse.update_progress(tot_bytes,tot_bytes,'Loading Local file: {} - '.format(local_fp))

    print('\n\nLoading Azure files...')
    if os.path.isdir(azure_path):
        files = [azure_fp.path for azure_fp in scantree(azure_path) if azure_fp.name.endswith('.json')]
    else:
        files = [azure_path]

    verbose_output = []

    ei_miss_local = 0
    azure_data = []
    for ii,azure_fp in enumerate(files):
        bytes_count = 0
        tot_bytes = os.path.getsize(azure_fp)
        for i,x in enumerate(gzip.open(azure_fp, 'rb') if azure_fp.endswith('.gz') else open(azure_fp, 'rb')):
            bytes_count += len(x)
            if (i+1) % 10000 == 0:
                if azure_fp.endswith('.gz'):
                    ds_parse.update_progress(i+1,prefix='File {}/{}: {} - '.format(ii+1,len(files),azure_fp))
                else:
                    ds_parse.update_progress(bytes_count,tot_bytes,'File {}/{}: {} - '.format(ii+1,len(files),azure_fp))

            if x.startswith(b'{"_label_cost":'):
                data = ds_parse.json_cooked(x)
                ei = str(data['ei'], 'utf-8')
                c = str(data['cost'], 'utf-8')
                azure_data.append((ei, c))
                if ei not in gt:
                    ei_miss_local += 1
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Local'.format(len(azure_data),ei))
                else:
                    gt[ei].setdefault('azure_data',[]).append((c, data['ts']))
        if azure_fp.endswith('.gz'):
            ds_parse.update_progress(i+1,prefix='File {}/{}: {} - '.format(ii+1,len(files),azure_fp))
        else:
            ds_parse.update_progress(bytes_count,tot_bytes,'File {}/{}: {} - '.format(ii+1,len(files),azure_fp))
        print()
    print()

    dup_azure_counter = collections.Counter()
    dup_rew_counter = collections.Counter()
    err_rewards_idx = []
    no_events_idx = []
    no_rewards_idx = []
    for i,ei in enumerate(gt):
        if (i+1) % 10000 == 0:
            ds_parse.update_progress(i+1,len(gt),'Evaluating differences - ')
        if 'local_rew' in gt[ei]:
            if len(gt[ei]['local_rew']) > 1:
                dup_rew_counter.update([len(gt[ei]['local_rew'])])
                if verbose:
                    verbose_output.append('Idx: {} - EventId: {} - Duplicate in Reward: {}'.format(gt[ei]['i'],ei,gt[ei]['local_rew']))
            else:
                if 'azure_data' in gt[ei]:
                    if len(gt[ei]['azure_data']) > 1:
                        dup_azure_counter.update([len(gt[ei]['azure_data'])])
                        if verbose:
                            verbose_output.append('Idx: {} - EventId: {} - Duplicate in Azure: {}'.format(gt[ei]['i'],ei,gt[ei]['azure_data']))
                    else:
                        a = float(gt[ei]['local_rew'][0])
                        b = float(gt[ei]['azure_data'][0][0])
                        if abs(a+b) > max(1e-7 * max(abs(a), abs(b)), 1e-6):
                            err_rewards_idx.append(gt[ei]['i'])
                            if verbose:
                                verbose_output.append('Idx: {} - EventId: {} - Error in reward: Local: {} Azure: {}'.format(gt[ei]['i'],ei,gt[ei]['local_rew'][0],gt[ei]['azure_data'][0]))
                else:
                    no_events_idx.append(gt[ei]['i'])
                    if verbose:
                        verbose_output.append('Idx: {} - EventId: {} - Ranking missing from Azure'.format(gt[ei]['i'],ei))
        else:
            no_rewards_idx.append(gt[ei]['i'])
            if verbose:
                verbose_output.append('Idx: {} - EventId: {} - Reward missing from local'.format(gt[ei]['i'],ei))
    ds_parse.update_progress(i+1,len(gt),'Evaluating differences - ')
    print()

    for x in verbose_output:
        print(x)

    print('\nComputing summary stats...')
    rew_dict = {y[0]: y[1] for y in local_rew}
    azure_dict = {y[0]: y[1] for y in azure_data}

    dup_azure = sum((x-1)*dup_azure_counter[x] for x in dup_azure_counter)
    dup_rew = sum((x-1)*dup_rew_counter[x] for x in dup_rew_counter)
    if verbose:
        print('-----'*10)
        print('Missing events indexes (1-based indexing)\n{}'.format(no_events_idx))
        print('-----'*10)
        print('Missing local rewards indexes (1-based indexing)\n{}'.format(no_rewards_idx))
        print('-----'*10)
        print('Wrong rewards indexes (1-based indexing)\n{}'.format(err_rewards_idx))
    print('-----'*10)
    print('Events in local_rank: {} (Duplicates: {})'.format(len_local_rank, dup_rank))
    print('Events in local_rew: {} (Duplicates: {} - {})'.format(len(local_rew), dup_rew, dup_rew_counter))
    print('Events in azure_data: {} (Duplicates: {} - {})'.format(len(azure_data), dup_azure, dup_azure_counter))
    print('-----'*10)
    print('Intersection local_rank/local_rew:',sum(1 for x in rew_dict if x in gt))
    print('Intersection local_rank/azure_data:',sum(1 for x in azure_dict if x in gt))
    print('Missing EventIds from local: {}'.format(ei_miss_local))
    print('Missing EventIds from azure: {}'.format(len(no_events_idx)), end='')
    if no_events_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_events_idx),len_local_rank), end='')
    print('\nMissing Local Rewards: {}'.format(len(no_rewards_idx)), end='')
    if no_rewards_idx:
        print(' (oldest 1-base index: {}/{})'.format(min(no_rewards_idx),len_local_rank), end='')
    print('\nWrong rewards: {}'.format(len(err_rewards_idx)))
    print('-----'*10)
    print('status_codes errors: {}'.format(err_codes.most_common()))
    print('Lines skipped in Local file: {}'.format(lines_errs))
    print('-----'*10)
    print('Elapsed time: ',time.time()-t)
    if plot_hist:
        if err_rewards_idx or no_events_idx or no_rewards_idx:
            plt.rcParams.update({'font.size': 16})  # General font size
            if err_rewards_idx:
                a = plt.hist(err_rewards_idx, hist_bin, label='Wrong reward', color='xkcd:orange')
                if verbose:
                    print('err_rewards_idx',a)
            if no_events_idx:
                b = plt.hist(no_events_idx, hist_bin, label='No rank', color='xkcd:blue')
                if verbose:
                    print('no_events_idx',b)
            if no_rewards_idx:
                c = plt.hist(no_rewards_idx, hist_bin, label='No local reward', color='xkcd:red')
                if verbose:
                    print('no_rewards_idx',c)
            plt.title('Missing/Wrong rank and reward requests', fontsize=20)
            plt.xlabel('Request index', fontsize=18)
            plt.ylabel('Bin Count', fontsize=18)
            plt.legend()
            plt.show()
        else:
            print('Nothing to plot! All is good!')
Example 7
def compute_estimates(log_fp):
    # Init estimators
    online = ips_snips.Estimator()
    baseline1 = ips_snips.Estimator()
    baselineR = ips_snips.Estimator()
    online_mle = mle.Estimator()
    baseline1_mle = mle.Estimator()
    baselineR_mle = mle.Estimator()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz')
            else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        if (i + 1) % 10000 == 0:
            if log_fp.endswith('.gz'):
                ds_parse.update_progress(i + 1)
            else:
                ds_parse.update_progress(bytes_count, tot_bytes)

        # parse dsjson file
        if x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
            data = ds_parse.json_cooked(x)

            if data['skipLearn']:
                continue

            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # Update estimators with tuple (p_log, r, p_pred)
            online.add_example(data['p'], r, data['p'])
            baseline1.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR.add_example(data['p'], r, 1 / data['num_a'])

            online_mle.add_example(data['p'], r, data['p'])
            baseline1_mle.add_example(data['p'], r, 1 if data['a'] == 1 else 0)
            baselineR_mle.add_example(data['p'], r, 1 / data['num_a'])

            evts += 1

    if log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(i + 1)
    else:
        len_text = ds_parse.update_progress(bytes_count, tot_bytes)

    print('\nProcessed {} events out of {} lines'.format(evts, i + 1))

    print('online_ips:', online.get_estimate('ips'))
    print('baseline1_ips:', baseline1.get_estimate('ips'))
    print('baselineR_ips:', baselineR.get_estimate('ips'))

    print('online_snips:', online.get_estimate('snips'))
    print('baseline1_snips:', baseline1.get_estimate('snips'))
    print('baselineR_snips:', baselineR.get_estimate('snips'))

    print('online_mle:', online_mle.get_estimate())
    print('baseline1_mle:', baseline1_mle.get_estimate())
    print('baselineR_mle:', baselineR_mle.get_estimate())
Example 8
def create_act_d(l):
    act_d = {}
    data = []
    ctr_all = {}
    for i,x in enumerate(l):
        js = json.loads(x[1])
        if i == 0:
            print('These are the actions features from your first event:\n',js['c']['_multi'])
            actions_names_fields = input('\nEnter a (comma separated) list of JSON fields used to extract the action name:').split(',')
            try:
                sep1,sep2 = input('Enter separators to parse the action name string keeping only substring between the separators (comma separated):').split(',')
            except ValueError:
                print('Separators not correctly entered - not using separators')
                sep1,sep2 = '',''
            print('Start parsing...')
        
        if max(js['p']) - min(js['p']) > 1e-5:
            model_ind = js['a'][np.argmax(js['p'])]-1
            vw_model = js.get('VWState', {}).get('m', 'N/A')
        else:
            model_ind = -1
            a_mod = None
            vw_model = 'N/A'
        actions = set()
        temp = []
        for j,y in enumerate(js['c']['_multi']):

            ########### Parsing action features to extract name ########
            action_name = y[actions_names_fields[0]]
            for field in actions_names_fields[1:]:
                action_name = action_name[field]
            if sep1:
                action_name = action_name.split(sep1,1)[1]
            if sep2:
                action_name = action_name.split(sep2,1)[0]
            ############################################################
            
            is_firstAction = int(j > 0)
            if action_name not in act_d:
                act_d[action_name] = (len(act_d),[],[])
            if action_name not in actions:
                actions.add(action_name)
                act_d[action_name][is_firstAction+1].append(i)
            if j == js['_labelIndex']:
                a = act_d[action_name][0]
            if j == model_ind:
                a_mod = act_d[action_name][0]
            temp.append(action_name)
            if act_d[action_name][0] not in ctr_all:
                ctr_all[act_d[action_name][0]] = [0,0,0,0,0,action_name,collections.Counter()]
            ctr_all[act_d[action_name][0]][2] += 1
        data.append((a, a_mod, js['_label_cost'], model_ind, js['_label_Action'], js['Timestamp'],temp, vw_model))
        ctr_all[a][1] += 1
        ctr_all[a][4] += 1/js['_label_probability']
        ctr_all[a][-1].update([-js['_label_cost']])
        if js['_label_cost'] != 0:
            ctr_all[a][0] -= js['_label_cost']
            ctr_all[a][3] -= js['_label_cost']/js['_label_probability']

        if (i+1) % 1000 == 0:
            ds_parse.update_progress(i+1,len(l))
    ds_parse.update_progress(i+1,len(l))

    print('\n\nActionId,Rewards,Chosen,Available,Rew. IPS,Chosen IPS,IPS,SNIPS,ActionName')
    for a in range(len(ctr_all)):
        print(','.join(map(str,[a]+ctr_all[a][:-2]+[ctr_all[a][3]/max(ctr_all[a][2],1),ctr_all[a][3]/max(ctr_all[a][4],1)]+[ctr_all[a][-2]])))
    
    print('\nMost Common Rewards')
    rew_list = sorted({x[0] for a in range(len(ctr_all)) for x in ctr_all[a][-1].most_common(10)})
    print(','.join(map(str, ['ActionId']+rew_list)))
    for a in range(len(ctr_all)):
        print(','.join(map(str, [a]+[ctr_all[a][-1][r] for r in rew_list])))
        
    return act_d,data,ctr_all
Example 9
def create_act_d(l):
    act_d = {}
    data = []
    ctr_all = {}
    for i, x in enumerate(l):
        js = json.loads(x[1])
        if i == 0:
            print('These are the actions features from your first event:\n',
                  js['c']['_multi'])
            actions_names_fields = input(
                '\nEnter a (comma separated) list of JSON fields used to extract the action name:'
            ).split(',')
            try:
                sep1, sep2 = input(
                    'Enter separators to parse the action name string keeping only substring between the separators (comma separated):'
                ).split(',')
            except ValueError:
                print(
                    'Separators not correctly entered - not using separators')
                sep1, sep2 = '', ''
            print('Start parsing...')

        if max(js['p']) - min(js['p']) > 1e-5:
            model_ind = js['a'][np.argmax(js['p'])] - 1
            vw_model = js.get('VWState', {}).get('m', 'N/A')
        else:
            model_ind = -1
            a_mod = None
            vw_model = 'N/A'
        actions = set()
        temp = []
        for j, y in enumerate(js['c']['_multi']):

            ########### Parsing action features to extract name ########
            action_name = y[actions_names_fields[0]]
            for field in actions_names_fields[1:]:
                action_name = action_name[field]
            if sep1:
                action_name = action_name.split(sep1, 1)[1]
            if sep2:
                action_name = action_name.split(sep2, 1)[0]
            ############################################################

            is_firstAction = int(j > 0)
            if action_name not in act_d:
                act_d[action_name] = (len(act_d), [], [])
            if action_name not in actions:
                actions.add(action_name)
                act_d[action_name][is_firstAction + 1].append(i)
            if j == js['_labelIndex']:
                a = act_d[action_name][0]
            if j == model_ind:
                a_mod = act_d[action_name][0]
            temp.append(action_name)
            if act_d[action_name][0] not in ctr_all:
                ctr_all[act_d[action_name][0]] = [
                    0, 0, 0, 0, 0, action_name,
                    collections.Counter()
                ]
            ctr_all[act_d[action_name][0]][2] += 1
        data.append((a, a_mod, js['_label_cost'], model_ind,
                     js['_label_Action'], js['Timestamp'], temp, vw_model))
        ctr_all[a][1] += 1
        ctr_all[a][4] += 1 / js['_label_probability']
        ctr_all[a][-1].update([-js['_label_cost']])
        if js['_label_cost'] != 0:
            ctr_all[a][0] -= js['_label_cost']
            ctr_all[a][3] -= js['_label_cost'] / js['_label_probability']

        if (i + 1) % 1000 == 0:
            ds_parse.update_progress(i + 1, len(l))
    ds_parse.update_progress(i + 1, len(l))

    print(
        '\n\nActionId,Rewards,Chosen,Available,Rew. IPS,Chosen IPS,IPS,SNIPS,ActionName'
    )
    for a in range(len(ctr_all)):
        print(','.join(
            map(str, [a] + ctr_all[a][:-2] + [
                ctr_all[a][3] / max(ctr_all[a][2], 1),
                ctr_all[a][3] / max(ctr_all[a][4], 1)
            ] + [ctr_all[a][-2]])))

    print('\nMost Common Rewards')
    rew_list = sorted({
        x[0]
        for a in range(len(ctr_all)) for x in ctr_all[a][-1].most_common(10)
    })
    print(','.join(map(str, ['ActionId'] + rew_list)))
    for a in range(len(ctr_all)):
        print(','.join(map(str, [a] + [ctr_all[a][-1][r] for r in rew_list])))

    return act_d, data, ctr_all
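In the final CSV printed by create_act_d, each ctr_all[a] row holds [reward sum, times chosen, times available, reward IPS sum, chosen IPS sum, action name, reward Counter]; the IPS column divides the reward IPS sum by the availability count, while SNIPS divides it by the chosen-IPS sum. A hedged standalone illustration with one hypothetical row:

# Sketch of the per-action IPS/SNIPS columns printed above, using one
# hypothetical ctr_all-style row (the trailing reward Counter is omitted here).
rew_sum, chosen, available, rew_ips_sum, chosen_ips_sum = 12.0, 80, 400, 55.0, 390.0

ips = rew_ips_sum / max(available, 1)         # matches ctr_all[a][3] / max(ctr_all[a][2], 1)
snips = rew_ips_sum / max(chosen_ips_sum, 1)  # matches ctr_all[a][3] / max(ctr_all[a][4], 1)
print('IPS: {:.4f}  SNIPS: {:.4f}'.format(ips, snips))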
Example 10
def create_stats(log_fp,
                 log_type='cb',
                 d=None,
                 predictions_files=None,
                 is_summary=False,
                 report_progress=True):

    t0 = time.time()
    if d is None:
        d = {}

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp + '.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            if is_summary:
                name = pred_fp.split('/')[-1].split('.')[-2]
            else:
                name = pred_fp.split('.')[-2]  # check that policy name is encoded in file_name
            if name:
                if log_type == 'cb':
                    pred[name] = [
                        x.strip() for x in open(pred_fp) if x.strip()
                    ]
                elif log_type == 'ccb':
                    with open(pred_fp) as f:
                        pred[name] = []
                        slot = []
                        for x in f:
                            x = x.strip()
                            if x:
                                slot.append(x)
                            else:
                                pred[name].append(slot)
                                slot = []
                print('Loaded {} predictions from {}'.format(
                    len(pred[name]), pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.
                  format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(
            len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.
              format([len(pred[name]) for name in pred]))
        sys.exit()

    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    for i, x in enumerate(
            gzip.open(log_fp, 'rb') if log_fp.endswith('.gz')
            else open(log_fp, 'rb')):
        if report_progress:
            # display progress
            bytes_count += len(x)
            if (i + 1) % 1000 == 0:
                if log_fp.endswith('.gz'):
                    ds_parse.update_progress(i + 1)
                else:
                    ds_parse.update_progress(bytes_count, tot_bytes)

        data = None

        if log_type == 'ccb':
            if x.startswith(b'{"Timestamp"') and x.strip().endswith(b'}'):
                data = ds_parse.ccb_json_cooked(x)
                aggregates_ccb_data(data, pred, d, evts)

        elif log_type == 'cb':
            if is_summary:
                data = json.loads(x.decode("utf-8"))
            elif x.startswith(b'{"_label_cost":') and x.strip().endswith(b'}'):
                data = ds_parse.json_cooked(x, do_decode=True)

            # Skip wrongly formated lines or not activated lines
            if data is None or data['skipLearn']:
                continue

            aggregates_cb_data(data, pred, d, evts)
        evts += 1

    if report_progress:
        if log_fp.endswith('.gz'):
            len_text = ds_parse.update_progress(i + 1)
        else:
            len_text = ds_parse.update_progress(bytes_count, tot_bytes)
        sys.stdout.write("\r" + " " * len_text + "\r")
        sys.stdout.flush()

    print('Read {} lines - Processed {} events'.format(i + 1, evts))

    if any(len(pred[name]) != evts for name in pred):
        print(
            'Error: Prediction file length ({}) is different from number of events in log file ({})'
            .format([len(pred[name]) for name in pred], evts))
        sys.exit()
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time() - t0))
    return d
Example 11
def create_stats(log_fp, dashboard_file, predictions_files=None):

    t0 = time.time()

    if predictions_files is None:
        print('Searching prediction files for log file: {}'.format(log_fp))
        predictions_files = []
        for fn in os.scandir(os.path.dirname(log_fp)):
            if fn.path.startswith(log_fp+'.') and fn.name.endswith('.pred'):
                predictions_files.append(fn.path)

    # load predictions from predictions_files
    pred = {}
    for pred_fp in predictions_files:
        if os.path.isfile(pred_fp):
            name = pred_fp.split('.')[-2]   # check that policy name is encoded in file_name
            if name:
                pred[name] = [x.strip() for x in open(pred_fp) if x.strip()]
                print('Loaded {} predictions from {}'.format(len(pred[name]),pred_fp))
            else:
                print('Name is not valid - Skip: {}'.format(pred_fp))
        else:
            print('Error loading policy predictions. Pred file not found: {}'.format(pred_fp))
            sys.exit()

    if len(pred) > 1 and min(len(pred[name]) for name in pred) != max(len(pred[name]) for name in pred):
        print('Error: Prediction file length ({}) must be equal for all files'.format([len(pred[name]) for name in pred]))
        sys.exit()

    d = {}
    print('Processing: {}'.format(log_fp))
    bytes_count = 0
    tot_bytes = os.path.getsize(log_fp)
    evts = 0
    i = 0
    for x in (gzip.open(log_fp, 'rb') if log_fp.endswith('.gz') else open(log_fp, 'rb')):
        # display progress
        bytes_count += len(x)
        i += 1
        if i % 5000 == 0:
            if log_fp.endswith('.gz'):
                if i % 20000 == 0:
                    print('.', end='', flush=True)
                    if i % 1000000 == 0:
                        print(' - Iter:',i)
            else:
                ds_parse.update_progress(bytes_count,tot_bytes)

        if x.startswith(b'{"_label_cost":'):
            data = ds_parse.json_cooked(x)
            r = 0 if data['cost'] == b'0' else -float(data['cost'])

            # binning time stamp every 5 min
            ts_bin = get_ts_5min_bin(data['ts'])

            # initialize aggregate for ts_bin
            if ts_bin not in d:
                d[ts_bin] = collections.OrderedDict({'online' : {'n':0,'d':0},
                                                     'baseline1' : {'n':0.,'d':0.,'c':0.,'N':0},
                                                     'baselineRand' : {'n':0.,'d':0.,'c':0.,'N':0}})
                for name in pred:
                    d[ts_bin][name] = {'n':0.,'d':0.,'c':0.,'N':0}

            # online and baseline policies
            d[ts_bin]['online']['d'] += 1
            d[ts_bin]['baselineRand']['N'] += 1
            d[ts_bin]['baselineRand']['d'] += 1/data['p']/data['num_a']
            if data['a'] == 1:
                d[ts_bin]['baseline1']['N'] += 1
                d[ts_bin]['baseline1']['d'] += 1/data['p']

            if r != 0:
                d[ts_bin]['online']['n'] += r
                d[ts_bin]['baselineRand']['n'] += r/data['p']/data['num_a']
                d[ts_bin]['baselineRand']['c'] = max(d[ts_bin]['baselineRand']['c'], r/data['p']/data['num_a'])
                if data['a'] == 1:
                    d[ts_bin]['baseline1']['n'] += r/data['p']
                    d[ts_bin]['baseline1']['c'] = max(d[ts_bin]['baseline1']['c'], r/data['p'])

            # additional policies from predictions
            for name in pred:
                pred_prob = get_prediction_prob(data['a']-1, pred[name][evts])     # a-1: 0-index action
                
                if pred_prob > 0:
                    p_over_p = pred_prob/data['p']
                    d[ts_bin][name]['d'] += p_over_p
                    d[ts_bin][name]['N'] += 1
                    if r != 0:
                        d[ts_bin][name]['n'] += r*p_over_p
                        d[ts_bin][name]['c'] = max(d[ts_bin][name]['c'], r*p_over_p)
            evts += 1
    if not log_fp.endswith('.gz'):
        len_text = ds_parse.update_progress(bytes_count,tot_bytes)
        sys.stdout.write("\r" + " "*len_text + "\r")
        sys.stdout.flush()

    print('Processed {} events'.format(evts))
    if any(len(pred[name]) != evts for name in pred):
        print('Error: Prediction file length ({}) is different from number of events in log file ({})'.format([len(pred[name]) for name in pred],evts))
        sys.exit()

    output_dashboard_data(d, dashboard_file)
    
    print('Total Elapsed Time: {:.1f} sec.'.format(time.time()-t0))
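Both create_stats variants bin events with get_ts_5min_bin(data['ts']), which is not shown in these examples. Below is a plausible sketch, assuming timestamps share the 'YYYY-MM-DDTHH:MM:SS...' prefix format used elsewhere in this listing; the real helper may differ.

# Hypothetical sketch of the 5-minute binning helper referenced above; the
# actual get_ts_5min_bin is not included in these examples and may differ.
def get_ts_5min_bin(ts):
    ts = ts.decode('utf-8') if isinstance(ts, bytes) else ts
    hh, mm = int(ts[11:13]), int(ts[14:16])
    mm = (mm // 5) * 5                        # floor minutes to a 5-minute boundary
    return '{}{:02d}:{:02d}:00'.format(ts[:11], hh, mm)

print(get_ts_5min_bin('2023-05-01T10:33:27Z'))   # -> 2023-05-01T10:30:00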