Example #1
def do_json(args):
  versions = {}
  for path in args.logdirs:
    if os.path.isdir(path):
      for root, dirs, files in os.walk(path):
        version = os.path.basename(root)
        if version not in versions: versions[version] = {}
        for filename in files:
          if filename.endswith(".txt"):
            m = re.match(r'^([^#]+)(#.*)?\.txt$', filename)
            domain = m.group(1)
            if domain not in versions[version]: versions[version][domain] = {}
            read_stats(os.path.join(root, filename),
                       versions[version][domain], args)
  for version, domains in versions.items():
    if args.aggregate:
      create_total_page_stats(domains, args)
    for domain, entries in domains.items():
      stats = []
      for name, value in entries.items():
        # We don't want the calculated sum in the JSON file.
        if name == "Sum": continue
        entry = [name]
        for x in ['time_list', 'count_list']:
          s = statistics(entries[name][x])
          entry.append(round(s['average'], 1))
          entry.append(round(s['ci']['abs'], 1))
          entry.append(round(s['ci']['perc'], 2))
        stats.append(entry)
      domains[domain] = stats
  print(json.dumps(versions, separators=(',', ':')))
Example #2
def bootstrap(m, query, fn):
    L, R = get_data(query)
    stats = []
    for _ in range(50):
        Lb, Rb = make_boot_sample(L, R)
        stats.append(fn(m, Lb, Rb))
    return avg(stats), stddev(stats)
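# The helpers get_data, make_boot_sample, avg, and stddev above are project-specific
# and not shown. A minimal sketch of a paired resampler, assuming L and R are
# parallel sequences of equal length (hypothetical, for illustration only):
import random

def make_boot_sample(L, R):
    # Draw indices with replacement so the L/R pairs stay aligned.
    idx = [random.randrange(len(L)) for _ in range(len(L))]
    return [L[i] for i in idx], [R[i] for i in idx]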
Example #3
def get_peruser_stats(df,
                      metrics,
                      mean_suffix='mean',
                      std_suffix='std',
                      test_group_column='test_group',
                      control='CONTROL',
                      test='TEST',
                      alpha=0.05):
    stats = list()
    for metric in metrics:
        test_mean, control_mean, diff, confint, zstat, pvalue = get_peruser_diff_zstat(df,
                            'units',
                            '{0} {1}'.format(metric, mean_suffix),
                            '{0} {1}'.format(metric, std_suffix),
                            test_group_column=test_group_column,
                            test=test,
                            control=control,
                            alpha=alpha,
                            )
        stats.append(OrderedDict(
                        metric=metric,
                        test_mean=test_mean,
                        control_mean=control_mean,
                        diff=diff,
                        lcl=confint[0],
                        ucl=confint[1],
                        zstat=zstat,
                        pvalue=pvalue,
                        ))


    return pd.DataFrame(stats)
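# get_peruser_diff_zstat is not shown. A simplified sketch of what such a helper
# could compute -- a two-sample z-test from per-group summary statistics -- with a
# hypothetical signature (the real helper also takes a frame and a units column):
import numpy as np
from scipy import stats as st

def diff_zstat(mean_t, std_t, n_t, mean_c, std_c, n_c, alpha=0.05):
    se = np.sqrt(std_t ** 2 / n_t + std_c ** 2 / n_c)  # SE of the mean difference
    diff = mean_t - mean_c
    zstat = diff / se
    pvalue = 2 * st.norm.sf(abs(zstat))                # two-sided p-value
    half = st.norm.ppf(1 - alpha / 2) * se             # normal CI half-width
    return mean_t, mean_c, diff, (diff - half, diff + half), zstat, pvalue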
Example #4
def parse_model_file(filepath):
    with open(filepath, 'r') as f:
        lines = [l.strip() for l in f.readlines() if l.strip() != '']
    started_stats = False
    active_joints = None
    vid_length = None
    num_rests = None
    warp_indices = None
    connection_lengths = None  # avoid a NameError if no ConnectionLengths line exists
    stats = []
    for line in lines:
        if line == '--stats--':
            started_stats = True
            continue
        if started_stats:
            stat_dict = eval(line)
            stats.append(stat_dict)
            continue
        if line.startswith('ActiveJoints='):
            active_joints = eval(line.split('=', 1)[1])
            continue
        if line.startswith('VidLength='):
            vid_length = eval(line.split('=', 1)[1])
            continue
        if line.startswith('NumRests='):
            num_rests = eval(line.split('=', 1)[1])
            continue
        if line.startswith('WarpIndices='):
            warp_indices = eval(line.split('=', 1)[1])
            continue
        if line.startswith('ConnectionLengths='):
            connection_lengths = eval(line.split('=', 1)[1])
            continue

    return vid_length, active_joints, num_rests, warp_indices, connection_lengths, stats
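# Note: the eval calls above execute arbitrary expressions read from the file. If
# the values are plain Python literals (numbers, lists, dicts), ast.literal_eval
# is a safer drop-in; the value below is a hypothetical example.
import ast

warp_indices = ast.literal_eval('[0, 4, 7]')  # parses literals only
# ast.literal_eval('__import__("os")') raises ValueError instead of running code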
Example #5
def evaluate_by_lc(ar_p, ar_t, ar_lc, mask, nodata_lc, out_dir):
    '''
    Return a dataframe of the agreement between ar_p and ar_t by class in ar_lc. 
    '''
    print('Getting unique values...')
    classes = np.unique(ar_lc[(ar_lc != nodata_lc) & (ar_lc != 0)]) # Take 0's out too
    stats = []
    for lc in classes:
        print('Calculating statistics for class', lc)
        this_mask = (ar_lc == lc) & mask
        this_t = ar_t[this_mask]
        this_p = ar_p[this_mask]
        ac, ac_s, ac_u, ssd, spod = calc_agree_coef(this_t, this_p, this_t.mean(), this_p.mean())
        rmspe = calc_rmspe(this_t, this_p)
        class_stats = {'lc_class': lc, 'agree_coef': ac, 'AC_sys': ac_s, 'AC_unsys': ac_u, 'rmspe': rmspe}
        stats.append(class_stats)
    
    df = pd.DataFrame(stats).reindex(columns=['lc_class', 'agree_coef', 'AC_sys', 'AC_unsys', 'rmspe'])
    out_txt = os.path.join(out_dir, 'lc_stats.txt')
    df.to_csv(out_txt, sep='\t', index=False)
    
    out_png = os.path.join(out_dir, 'agreement_per_lc_no0.png')
    class_labels = ['Water', 'Ice/Snow', 'Developed', 'Bare Ground', 'Deciduous Forest', 'Coniferous Forest', 'Shrubland']
    plot_agreement(df, out_png, class_labels=class_labels)
    
    return df    
Example #6
def main():
    _dir = os.path.dirname(__file__)
    _path = glob.glob(os.path.join(_dir, '..', 'data_and_results',
                                   'HYP_CELL_NWB', 'Naive', '*.nwb'))
    full_qc = [0, 0, 0, 0]
    for fp in _path:
        realX, realY, realC = loadNWB(fp)
        temp_qc = run_qc(realY, realC)
        full_qc = np.vstack((full_qc, temp_qc))
    df = pd.DataFrame(
        data=full_qc[1:, :],
        columns=['Mean RMS', 'Max RMS', 'Mean Drift', 'Max Drift'],
        index=_path)
    df.to_csv('qc.csv')
    stats = []
    for col in df.columns.values:
        stats.append(df[col].quantile(0.1))
    qc_stats = pd.DataFrame(data=stats,
                            index=[
                                '10 percentile Mean RMS',
                                '10 percentile Max RMS',
                                '10 percentile Mean Drift',
                                '10 percentile Max Drift'
                            ])
    qc_stats.to_csv('qc_stats.csv')
Example #7
def bootstrap_CI(sample, conf=0.95):
    '''TODO: finish this - make it work, but also decide if this is the right algorithm to use
    default alpha 0.95'''
    values = sample.values
    # configure bootstrap
    n_iterations = 1000
    n_size = int(len(sample) * 0.50)  # ?
    print(n_size)
    # run bootstrap
    stats = list()
    for i in range(n_iterations):
        # prepare train and test sets
        train = resample(values, n_samples=n_size)
        test = np.array(
            [x for x in values if x.tolist() not in train.tolist()])
        # fit model
        model = DecisionTreeClassifier()
        model.fit(train[:, :-1], train[:, -1])
        # evaluate model
        predictions = model.predict(test[:, :-1])
        score = accuracy_score(test[:, -1], predictions)
        print(score)
        stats.append(score)

    # confidence intervals (honor the requested confidence level)
    alpha = conf
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    print('%.1f confidence interval %.1f%% and %.1f%%' %
          (alpha * 100, lower * 100, upper * 100))
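# The list-based test-set construction above is quadratic and compares float rows
# by value. A sketch of the same out-of-bag split done with row indices instead
# (toy data; values/n_size mirror the names used in bootstrap_CI):
import numpy as np

values = np.random.rand(100, 5)                        # toy stand-in for sample.values
n_size = len(values) // 2
rng = np.random.default_rng()
train_idx = rng.integers(0, len(values), size=n_size)  # rows drawn with replacement
oob_mask = np.ones(len(values), dtype=bool)
oob_mask[train_idx] = False
train, test = values[train_idx], values[oob_mask]      # out-of-bag rows as test set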
Example #8
def compute_stats(output, labels):

    stats = []

    for c in range(labels.shape[1]):
        # Average precision
        avg_precision = metrics.average_precision_score(
            labels[:, c], output[:, c], average=None)

        # AUC
        auc = metrics.roc_auc_score(labels[:, c], output[:, c], average=None)

        # Precisions, recalls
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            labels[:, c], output[:, c])

        # FPR, TPR
        (fpr, tpr, thresholds) = metrics.roc_curve(labels[:, c], output[:, c])

        save_every_steps = 1000  # Sample statistics to reduce size
        stat_dict = {'precisions': precisions[0::save_every_steps],
                     'recalls': recalls[0::save_every_steps],
                     'AP': avg_precision,
                     'fpr': fpr[0::save_every_steps],
                     'fnr': 1. - tpr[0::save_every_steps],
                     'auc': auc}
        stats.append(stat_dict)

    return stats
Example #9
    def compute(self, inp=None):
        stats = []
        layer_stats = [None, None]

        if self.interface.do_center == False:
            oldpos = np.copy(self.interface.universe.atoms.positions)
            self.interface.center()

        surface = self.interface._surfaces[0]
        surface.triangulation()

        if self.return_statistics is True:
            for side in [0, 1]:
                layer_stats[side] = utilities.triangulated_surface_stats(
                    surface.trimmed_surf_triangs[side],
                    surface.triangulation_points[side])
            # this average depends on what is in the stats, it can't be done
            # automatically
            stats.append(layer_stats[0][0] + layer_stats[1][0])
            # add here new stats other than total area

        if self.interface.do_center == False:
            self.interface.universe.positions = np.copy(oldpos)

        if self.return_triangulation is False:
            return stats
        else:
            return [
                stats, surface.surf_triang, surface.triangulation_points,
                surface.trimmed_surf_triangs
            ]
Example #10
def do_json(args):
    versions = {}
    for path in args.logdirs:
        if os.path.isdir(path):
            for root, dirs, files in os.walk(path):
                version = os.path.basename(root)
                if version not in versions: versions[version] = {}
                for filename in files:
                    if filename.endswith(".txt"):
                        m = re.match(r'^([^#]+)(#.*)?\.txt$', filename)
                        domain = m.group(1)
                        if domain not in versions[version]:
                            versions[version][domain] = {}
                        read_stats(os.path.join(root, filename),
                                   versions[version][domain], args)
    for version, domains in versions.items():
        if args.aggregate:
            create_total_page_stats(domains, args)
        for domain, entries in domains.items():
            stats = []
            for name, value in entries.items():
                # We don't want the calculated sum in the JSON file.
                if name == "Sum": continue
                entry = [name]
                for x in ['time_list', 'count_list']:
                    s = statistics(entries[name][x])
                    entry.append(round(s['average'], 1))
                    entry.append(round(s['ci']['abs'], 1))
                    entry.append(round(s['ci']['perc'], 2))
                stats.append(entry)
            domains[domain] = stats
    print(json.dumps(versions, separators=(',', ':')))
Example #11
def load_patches_db(path):
    masks = []
    stats = []
    nuclei_list = []
    files = os.listdir(path)
    patches1 = len(np.load(os.path.join(path, files[0]), encoding="latin1", allow_pickle=True))
    patches2 = len(np.load(os.path.join(path, files[1]), encoding="latin1", allow_pickle=True))
    patches3 = len(np.load(os.path.join(path, files[2]), encoding="latin1", allow_pickle=True))
    patches4 = len(np.load(os.path.join(path, files[3]), encoding="latin1", allow_pickle=True))
    patches5 = len(np.load(os.path.join(path, files[4]), encoding="latin1", allow_pickle=True))
    patches6 = len(np.load(os.path.join(path, files[5]), encoding="latin1", allow_pickle=True))
    images = np.zeros((patches1 + patches2 + patches3 + patches4 + patches5 + patches6, 32, 32, 3))
    c = 0
    i = 0
    for f in files:
        patches = np.load(os.path.join(path, f), encoding="latin1", allow_pickle=True)
        for patch in patches:
            image, mask, nuclei, stat = patch
            i += 1
            images[c] = image[:, :, :3]
            masks.append(mask)
            stats.append(stat)
            nuclei_list.append(nuclei)
            c += 1
    return images, masks, nuclei_list, stats
Example #12
def get_stats():
    # make a list to hold tuples of features from each file. We'll use this to build the dataframe
    stats = []

    # loop over each path and extract our features
    for path in PATHS:
        with open(path, 'r') as f:
            # the child class is indicated by which folder inside data it's in
            child_class = path.split('/')[2]
            # read the whole file
            data = f.read()
            # get the individual lines (could have done readlines, but we need the whole text too)
            lines = data.split('\n')
            # get the participants for this file
            participants = get_target_participants(data)

            # loop over the participants and collect features for each of them
            for participant in participants:
                # initialize collector variables
                tokens = []
                tags = []
                num_tokens = 0
                num_utterances = 0
                stage = get_stage(path)

                # loop over the lines
                for i, line in enumerate(lines):
                    # when we find a line with our participant, get the tokens and tags
                    if f'*{participant[0]}:' in line:
                        # get the lines containing tag data and extract tags
                        tag_lines = get_tag_lines(lines, i)
                        tags += get_tags(tag_lines)

                        # get any corresponding errors and corrections
                        errors = get_errors(lines, i)
                        
                        # get the lines containing tokens and clean / tokenize
                        token_lines = get_token_lines(lines, i)
                        tokens += get_tokens(token_lines, errors)

                        # increment for each utterance we see
                        num_utterances += 1

                # append the tuple with our data for this participant
                stats.append((path.split('/')[-1],
                              child_class,
                              participant,
                              stage,
                              tokens,
                              Counter(tokens),
                              Counter(tags),
                              len(tokens),
                              len(Counter(tokens)),
                              len(Counter(tags)),
                              len(tokens)/num_utterances))
    
    # make the dataframe from stats and return it
    df = pd.DataFrame(stats, columns=['file_name','child_class', 'child_name', 'stage', 'tokens','types', 'tags', 'num_tokens', 'num_types', 'num_tags', 'mlu'])
    return df
Example #13
 def get_coverage_stats(self):
     stats = []
     stats.append(["Average Cvg", self.get_average_coverage()])
     stats.append(["Cvg Std Dev", self.get_coverage_stddev()])
     stats.append(["Median Cvg", self.get_median_coverage()])
     stats.append(["Cvg Mode", self.get_coverage_mode()])
     stats.append(["Cvg Coeff Var", self.get_coefficient_of_variation()])
     return stats
Example #14
def explore_statistical_measures_across_repetitions(iterations=5,
                                                    ndatapoints=1000):
    stats = []
    for i in range(iterations):
        data1, data2 = get_two_datasets(ndatapoints)
        s = get_statistical_measures(data1, data2)
        stats.append(s)
    return stats
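# get_two_datasets and get_statistical_measures are not shown; a plausible sketch,
# assuming they draw synthetic samples and compare them with standard tests (names
# and distributions here are stand-ins, not the original implementation):
import numpy as np
from scipy import stats as st

def get_two_datasets(n):
    return np.random.normal(0.0, 1.0, n), np.random.normal(0.1, 1.0, n)

def get_statistical_measures(data1, data2):
    t, p = st.ttest_ind(data1, data2)
    return {'mean_diff': data1.mean() - data2.mean(), 't': t, 'p': p}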
Example #15
def stat_df(df):
    stats = []
    for col in df.columns:
        stats.append((col, df[col].nunique(),
                      df[col].isnull().sum() * 100 / df.shape[0],
                      df[col].value_counts(normalize=True, dropna=False).values[0] * 100,
                      df[col].dtype))

    stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
    stats_df.sort_values('Percentage of missing values', ascending=False, inplace=True)
    return stats_df
Example #16
 def convert_stat(self, x):
     x = np.atleast_2d(x)
     n = len(x)
     stats = []
     for i in range(n):
         stat = self.problem.sufficient_stat(x[i])
         stats.append(stat)
     s = np.vstack(stats)
     return s
Example #17
def test():
    """ test """
    indata = [0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.1, 0.4]

    stats = RollingStats()
    for i in indata:
        stats.append(i)

    print(stats.get())
    print(calculate_stats(indata))
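# RollingStats and calculate_stats are not shown. A minimal sketch of such a class
# using Welford's online algorithm for the running mean and standard deviation (an
# assumption about what the real class does):
import math

class RollingStats:
    def __init__(self):
        self.n, self.mean, self.m2 = 0, 0.0, 0.0

    def append(self, x):
        # Welford's update of count, mean, and sum of squared deviations.
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def get(self):
        var = self.m2 / (self.n - 1) if self.n > 1 else 0.0
        return self.mean, math.sqrt(var)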
Example #18
def test():
    """ test """
    indata = [0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.1, 0.4]

    stats = RollingStats()
    for i in indata:
        stats.append(i)

    print(stats.get())
    print(calculate_stats(indata))
Example #19
def calculate_stats(output, target, threshold):
    """Calculate statistics including mAP, AUC, etc.

	Args:
	  output: 3d array, (samples_num, time, classes_num)
	  target: 3d array, (samples_num, time, classes_num)

	Returns:
	  stats: list of statistic of each class.
	"""

    classes_num = target.shape[2]
    timestep_num = target.shape[1]
    print(classes_num)
    print(timestep_num)
    stats = []

    # Class-wise statistics
    for j, k in [(j, k) for j in range(timestep_num)
                 for k in range(classes_num)]:

        # Piecewise comparison
        output_rounded = output > threshold

        # Average precision
        avg_precision = metrics.average_precision_score(target[:, j, k],
                                                        output_rounded[:, j,
                                                                       k],
                                                        average=None)

        # AUC
        #auc = metrics.roc_auc_score(target[:, j, k], output_rounded[:, j, k], average=None)

        # Precisions, recalls
        (precisions, recalls,
         thresholds) = metrics.precision_recall_curve(target[:, j, k],
                                                      output_rounded[:, j, k])

        # FPR, TPR
        (fpr, tpr, thresholds) = metrics.roc_curve(target[:, j, k],
                                                   output_rounded[:, j, k])

        save_every_steps = 1000  # Sample statistics to reduce size
        stat_dict = {
            'precisions': precisions[0::save_every_steps],
            'recalls': recalls[0::save_every_steps],
            'AP': avg_precision,
            'fpr': fpr[0::save_every_steps],
            'fnr': 1. - tpr[0::save_every_steps]
            #'auc': auc
        }
        stats.append(stat_dict)

    return stats
Example #20
 def get_stats(self):
     stats = []
     for current_id in self.ids:
         current_rating = self.ratings[current_id]
         mu = current_rating.mu
         if self.float_precision is not None and isinstance(
                 self.float_precision, int):
             mu = truncate_float(mu, self.float_precision)
         c = (self.names[current_id], mu, current_rating.sigma)
         stats.append(c)
     return stats
Example #21
def get_stats(y):
    stats = []
    for i in range(len(y)):
        if y[i] > -10000:
            stats.append(y[i])

    mean = np.mean(stats)
    medi = np.median(stats)
    std = np.std(stats)

    return mean, medi, std
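# Assuming y is a NumPy array, the sentinel filtering and the three statistics can
# be done without the explicit loop:
import numpy as np

y = np.array([1.2, -99999.0, 3.4, 2.2])  # toy input containing a no-data sentinel
vals = y[y > -10000]                     # boolean mask drops sentinel entries
mean, medi, std = np.mean(vals), np.median(vals), np.std(vals)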
Example #22
def getSRs():
    from smodels.experiment.databaseObj import Database
    db = Database("official")
    ers = db.getExpResults(dataTypes=["efficiencyMap"])
    stats = []
    for er in ers:
        for ds in er.datasets:
            D = { "obsN": ds.dataInfo.observedN, "expectedBG": ds.dataInfo.expectedBG,
                  "bgError": ds.dataInfo.bgError, "upperLimit": ds.dataInfo.upperLimit,
                  "expectedUpperLimit": ds.dataInfo.expectedUpperLimit }
            stats.append(D)
    return stats
Example #23
def bootstrap_func(df, group_col, y):
    n_iterations = 1000
    n_size = int(len(df) * 0.50)
    stats = list()
    for i in range(n_iterations):
        sample = resample(df, n_samples=n_size)
        mean_dict = {grp[0]: grp[1][y].mean() for grp in sample.groupby(group_col)}
        stats.append(mean_dict['treatment'] - mean_dict['control'])
    est_diff = sum(stats) / len(stats)
    lower = np.percentile(stats, 2.5)
    upper = np.percentile(stats, 97.5)
    return {'est': est_diff, 'lower': lower, 'upper': upper}
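# A usage sketch with a toy frame. The group labels must match the 'treatment' and
# 'control' keys the function hard-codes, and resample is assumed to come from
# sklearn.utils:
import numpy as np
import pandas as pd
from sklearn.utils import resample

df = pd.DataFrame({
    'group': np.repeat(['treatment', 'control'], 500),
    'y': np.concatenate([np.random.normal(1.1, 1.0, 500),
                         np.random.normal(1.0, 1.0, 500)]),
})
print(bootstrap_func(df, 'group', 'y'))  # {'est': ..., 'lower': ..., 'upper': ...}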
Example #24
    def validation_end(self, outputs):

        truth = torch.cat([o['batch_truth'] for o in outputs], dim=0).reshape(-1)
        logits = torch.cat([o['batch_logits'] for o in outputs], dim=0).reshape(truth.shape[0],
                                                                                outputs[0]['batch_logits'].shape[1])
        loss_sum = torch.cat([o['batch_loss'].reshape(-1) for o in outputs], dim=0).reshape(-1)
        loss_sum = torch.sum(loss_sum, dim=0).reshape(-1)

        assert truth.shape[0] == sum([o['batch_logits'].shape[0] for o in outputs]), "Mismatch size"

        loss = self.loss(truth, logits)

        assert math.isclose(loss.item(), loss_sum.item(),
                            abs_tol=0.01), f"Loss not equal: {loss.item()} VS. {loss_sum.item()}"

        loss /= truth.shape[0]
        loss_sum /= truth.shape[0]

        proba = F.softmax(logits, dim=-1)
        pred = torch.argmax(proba, dim=-1).reshape(-1)

        with open(os.path.join(self.hparams.output_dir, "dev-labels.lst"), "w") as output_file:
            output_file.write("\n".join(map(str, (truth + self.task_config[self.hparams.task_name][
                'label_offset']).cpu().numpy().tolist())))

        with open(os.path.join(self.hparams.output_dir, "dev-predictions.lst"), "w") as output_file:
            output_file.write("\n".join(
                map(str, (pred + self.task_config[self.hparams.task_name]['label_offset']).cpu().numpy().tolist())))

        with open(os.path.join(self.hparams.output_dir, "dev-probabilities.lst"), "w") as output_file:
            output_file.write("\n".join(map(lambda l: '\t'.join(map(str, l)), proba.cpu().detach().numpy().tolist())))

        stats = []
        predl = pred.cpu().detach().numpy().tolist()
        truthl = truth.cpu().detach().numpy().tolist()

        for _ in range(100):
            indices = np.random.randint(len(predl), size=len(predl))
            sampled_pred = [predl[i] for i in indices]
            sampled_truth = [truthl[i] for i in indices]
            stats.append(accuracy_score(sampled_truth, sampled_pred))

        _, lower, upper = mean_confidence_interval(stats, self.hparams.ci_alpha)

        return {
            'val_loss': loss.item(),
            'val_acc': accuracy_score(truth.cpu().detach().numpy().tolist(), pred.cpu().detach().numpy().tolist()),
            'val_cil': lower,
            'val_ciu': upper,
        }
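# mean_confidence_interval is not shown; a common implementation uses a Student-t
# interval and returns (mean, lower, upper). A sketch, assuming the second argument
# is the confidence level:
import numpy as np
from scipy import stats as st

def mean_confidence_interval(data, confidence=0.95):
    a = np.asarray(data, dtype=float)
    m, se = a.mean(), st.sem(a)
    h = se * st.t.ppf((1 + confidence) / 2.0, len(a) - 1)  # t-based half-width
    return m, m - h, m + h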
Example #25
 def getStats(self):
     '''Gets statistics of the recovered parameters'''
     stats = []
     for idx in arange(len(self.recparams)):
         data = self.chain[idx]
         stats.append([
             min(data),
             max(data),
             mean(data),
             stdev(data),
             getConfidenceIntervals(
                 hist(data, 1000)[0], linspace(min(data), max(data), 1000),
                 [68.0, 95.0, 99.0])
         ])
Example #26
 def test(self, sample1: np.ndarray, sample2: np.ndarray, alpha=0.05):
     iters = self.options.get('iters', 1000)
     gt = compute_pr_x_ge_y(sample1, sample2)
     sample = np.concatenate((sample1, sample2))
     n = len(sample1)
     stats = []
     for _ in range(iters):
         np.random.shuffle(sample)
         sample1 = sample[:n]
         sample2 = sample[n:]
         stats.append(compute_pr_x_ge_y(sample1, sample2))
     p = np.mean(np.array(stats) <= gt)
     return p < alpha, p, p
Example #27
def getStatsForDataframe(df):
    # https://www.kaggle.com/artgor/is-this-malware-eda-fe-and-lgb-updated
    stats = []
    for col in df.columns:
        stats.append(
            (col, df[col].nunique(),
             df[col].isnull().sum() * 100 / df.shape[0],
             df[col].value_counts(normalize=True, dropna=False).values[0] *
             100, df[col].dtype))

    stats_df = pd.DataFrame(
        stats,
        columns=['Feature', 'Unique_values', '%Missing', '%Biggest', 'type'])
    return stats_df.sort_values('%Missing', ascending=False)
Example #28
def record(n_iters: int, outfile: str, command: List[str]) -> None:
    stats: List[Stat] = []
    for i in range(n_iters):
        print(f'\r[{i+1} / {n_iters}] Running {command}', end='', flush=True)
        stats.append(perf_stat(command))

    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(
            'cycles\tinstructions\tseconds_elapsed\tseconds_user\tseconds_sys\n'
        )
        for s in stats:
            f.write('\t'.join(s) + '\n')

    print(f'\nResults written to {outfile}.')
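# Stat and perf_stat are not shown; the header row suggests five string fields per
# run. A rough sketch wrapping Linux `perf stat` in machine-readable mode (-x sets
# the field separator); the stderr layout parsed below is an assumption:
import subprocess
from typing import List, Tuple

Stat = Tuple[str, str, str, str, str]

def perf_stat(command: List[str]) -> Stat:
    r = subprocess.run(
        ['perf', 'stat', '-x', ',', '-e', 'cycles,instructions'] + command,
        capture_output=True, text=True)
    counts = {}
    for line in r.stderr.splitlines():
        parts = line.split(',')
        if len(parts) >= 3:
            counts[parts[2]] = parts[0]  # event name -> raw count
    # The three timing fields are left blank in this sketch.
    return (counts.get('cycles', ''), counts.get('instructions', ''), '', '', '')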
Example #29
def calculate_stats(output, target):
    """Calculate statistics including mAP, AUC, etc.

    Args:
      output: 2d array, (samples_num, classes_num)
      target: 2d array, (samples_num, classes_num)

    Returns:
      stats: list of statistic of each class.
    """

    classes_num = target.shape[-1]
    stats = []

    # Class-wise statistics
    for k in range(classes_num):

        # Average precision
        avg_precision = metrics.average_precision_score(target[:, k],
                                                        output[:, k],
                                                        average=None)

        # AUC
        try:
            auc = metrics.roc_auc_score(target[:, k],
                                        output[:, k],
                                        average=None)
        except:
            pass

        # Precisions, recalls
        (precisions, recalls,
         thresholds) = metrics.precision_recall_curve(target[:, k], output[:,
                                                                           k])

        # FPR, TPR
        (fpr, tpr, thresholds) = metrics.roc_curve(target[:, k], output[:, k])

        save_every_steps = 1000  # Sample statistics to reduce size
        stat_dict = {
            'precisions': precisions[0::save_every_steps],
            'recalls': recalls[0::save_every_steps],
            'AP': avg_precision,
            'fpr': fpr[0::save_every_steps],
            'fnr': 1. - tpr[0::save_every_steps]
            #'auc': auc
        }
        stats.append(stat_dict)

    return stats
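# A toy invocation of this two-argument calculate_stats, assuming sklearn.metrics
# is imported as `metrics` in the same module:
import numpy as np

target = (np.random.rand(500, 3) > 0.5).astype(int)  # toy binary labels
output = np.random.rand(500, 3)                      # toy predicted scores
per_class = calculate_stats(output, target)
print([round(s['AP'], 3) for s in per_class])        # one AP per class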
Example #30
def calc_missing_stat(df, missing_only=True):
    stats = []
    for col in df.columns:
        stats.append(
            (col, df[col].nunique(),
             df[col].isnull().sum() * 100 / df.shape[0], df[col].dtype))

    stats_df = pd.DataFrame(
        stats, columns=['feature', 'num_of_unique', 'pct_of_missing', 'type'])
    stats_df = stats_df.sort_values('pct_of_missing',
                                    ascending=False).reset_index(drop=True)
    if missing_only: stats_df = stats_df[stats_df['pct_of_missing'] > 0]

    return stats_df
Example #31
def matrix_stats(X, n=6):
    '''
    Order of stats: mean, median, skewness, kurtosis, standard dev, sum of max,
    sum of min, RMS, interquartile range, l-scale, l-skewness, l-kurtosis.
    That order holds for n=4; for larger n the order is unchanged, but one
    additional L-moment is appended for each increment of n.
    Input: matrix of rows of data.
    Output: a 'summary' of each row as a row vector, joined into a matrix --
    basically a matrix of stats summarizing the data matrix.
    '''
    X = np.array(X)
    stats = []
    try:
        for x in X:
            stats.append(onerowstats(x, n))
    except TypeError:
        stats.append(onerowstats(X, n))
    return np.array(stats)
Example #32
def bootstrappedConfidenceInterval(
      trials, computeStatisticFunction,
      numResamples, pValue):
    stats = []
    t = time.time()
    for i in range(0, numResamples):
        # This is just to know when it'll be finished when it runs on a laptop.
        update(i,numResamples,t)
        
        resample = trials.sample(True, 1.0)
        stats.append(computeStatisticFunction(resample))
        
    stats.sort()  # sort in place; a bare sorted(stats) discarded its result
    lowerIndex = int(numResamples * pValue / 2 - 1)
    upperIndex = int(np.ceil(numResamples * (1 - pValue / 2)))
    return (stats[lowerIndex], stats[upperIndex])
Example #33
def answerCDF(api_site_name):

    try:
        key = bucket.get_key("QA/%s.json.zlib" % api_site_name)
        items = json.loads(zlib.decompress(key.get_contents_as_string()))
    except:
        print("data not found on S3. Please crawl data from stackexchange website first")
        return

    A = []
    S = []
    Sa = []
    rank = []
    score = []
    time = []

    for item in items:
        a = 0
        s = 0
        t0 = item['creation_date']


        if 'answers' in item:
            for i,answer in enumerate(item['answers']):

                a += 1
                dt = answer['creation_date'] - t0
                time.append(dt)

                score.append(answer['score'])
                s += answer['score']

                rank.append(i+1)

                #t0 = answer['creation_date']
            A.append(a)
            Sa.append(s)
            S.append(item['score'])

    return {'rank' : np.array(rank),
            'score' : np.array(score),
            'time' : np.array(time),
            'A': np.array(A),
            'S': np.array(S),
            'Sa': np.array(Sa),
            }
Example #34
    def run_expts(self, config: SchedulerConfig, num_srvs: int, num_expts: int,
                  seed_num: int):
        """Runs a number of experiments with the specified configuration.

        Args:
            config: The configuration of the Scheduler used in the experiments.
            num_srvs: The total number of servers.
            num_expts: The number of experiments to be run.
            seed_num: A seed used to update the job generator.

        Returns:
            list: A list of scheduling statistics.
        """
        stats = []
        for i in range(num_expts):
            expt_stats = self._run_expt(config, num_srvs, seed_num + i)
            stats.append(expt_stats)
        return stats
Example #35
def loop_show_asymmetry(prefix,
                        grouping_keys=['Gender', 'FDH_23_Handedness_Prtcpnt'],
                        xaxis_key='Age_At_IMGExam',
                        plots='regressions'):
    """ Loop over all properties to show asymmetry."""
    data = get_all_data()
    data.filter(lambda k, v: 'fuzzy' not in k)  # Remove 'fuzzy'
    data.filter([partial(lambda k, v, p: (k.startswith(p) or
                                          k in grouping_keys or
                                          k == xaxis_key),
                         p=p)
                 for p in prefix])

    # Process & plot the data.
    stats = []
    regressions = []
    group_samples = []
    measure_keys = data.get_twohemi_keys()
    for pi, key in enumerate(sorted(measure_keys)):
        print("Comparing %d (%s)..." % (pi, key))
        gn, ss, rv, gs = compare_group_asymmetry(data.data_dict, xaxis_key=xaxis_key,
                                                 yaxis_key=key, plots=plots,
                                                 grouping_keys=grouping_keys,
                                                 measure_key=key)
        stats.append(ss)
        regressions.append(rv)
        group_samples.append(gs)

    if 'regression_stats' in plots:
        dump_regressions_csv(regressions,
                             group_names=gn,
                             measure_names=measure_keys)

        plot_regressions_scatter(regressions,
                                 group_names=gn, 
                                 measure_names=measure_keys)

    if 'stat_distributions' in plots:
        plot_stat_distributions(stats, group_names=gn)

    plt.show()
Example #36
def test_search2():
    # Test that the search region works.

    search = rft.IntrinsicVolumes([3,4,5])
    x = np.linspace(0.1,10,100)

    stats = [rft.Gaussian(search=search)]
    ostats = [rft.Gaussian()]

    for dfn in range(5,10):
        for dfd in [40,50,np.inf]:
            stats.append(rft.FStat(dfn=dfn, dfd=dfd, search=search))
            ostats.append(rft.FStat(dfn=dfn, dfd=dfd))
            stats.append(rft.TStat(dfd=dfd, search=search))
            ostats.append(rft.TStat(dfd=dfd))
        stats.append(rft.ChiSquared(dfn=dfn, search=search))
        ostats.append(rft.ChiSquared(dfn=dfn))

    for i in range(len(stats)):
        stat = stats[i]
        ostat = ostats[i]
        v1 = stat(x)
        v2 = 0

        for j in range(search.mu.shape[0]):
            v2 += ostat.density(x, j) * search.mu[j]
        assert_almost_equal(v1, v2)
Example #37
    def _test_binary_predictor(self, features, responses):
        final_stats = []
        final_probs = []
        final_responses = []

        for predictors_array, ct_prefix, celltype_idx in [(self.ct1_predictor, 'ct1', 0), (self.ct2_predictor, 'ct2', 1)]:

            echo('Constructing examples for testing for ' + ct_prefix)
            stats = []
            probs = []
            resp = []
            formatted_features, formatted_responses = \
                self.format_features_and_responses(features={'primary_mark_signal': features[ct_prefix + '_mark_signal']},
                                                   responses=responses[ct_prefix + '_target_regions'])

            tp, fp, tn, fn, p = self._eval_predictor(predictors_array,
                                                     formatted_features,
                                                     formatted_responses,
                                                     return_probs=True,
                                                     celltype_idx=celltype_idx,
                                                     bagged_idx='all')
            stats.append((tp, fp, tn, fn))
            probs.extend(p)
            resp.extend(formatted_responses)

            echo('Global Evaluation')

            print_eval(*map(sum, zip(*stats)))

            if len(resp) != len(probs):
                print('ERROR', len(resp), len(probs))
                exit(1)

            final_stats.append(stats)
            final_probs.append(probs)
            final_responses.append(resp)

        return final_stats, final_probs, final_responses
Example #38
def calc_multigof(data, model):
	#choice_limit = 1e-2 / len(data)
	#choices = numpy.array([[scipy.stats.poisson(m).pmf(i)
			#for i in range(
				#int(scipy.stats.poisson(m).ppf(choice_limit)),
				#int(scipy.stats.poisson(m).ppf(1 - choice_limit) + 1))]
					#for m in model])
	
	stats = []
	for i in range(20):
		n = 4**i
		if n > len(data):
			break
		
		for j in range(int(numpy.ceil(len(data) * 1. / n))):
			dpart = data[j*n:(j + 1)*n]
			mpart = model[j*n:(j + 1)*n]
			#cpart = choices[j*n: (j + 1)*n]
			#print data.shape, dpart.shape, j*n, (j + 1)*n, dpart, data
			k = int(dpart.sum()) if len(dpart) > 0 else 0
			m = mpart.sum()
			
			# compare this probability to all the other k
			#probs = gen_choices(cpart, k)
			if m > 0:
				probs = scipy.stats.poisson(m).pmf(k)
			else:
				if k == 0:
					probs = 1
				else:
					probs = 1e-10
			stats.append([n, j, probs, m, k]) #  * len(data) / n])
			assert not numpy.isnan(probs), [mpart.sum(), k]
			#print '  multigof', n, j, probs, len(stats)
			# lam = sum(mpart) WRONG!
			# go through all possibilities to get k
	return numpy.array(stats)
Example #39
def test_search1():
    # Test that the search region works.
    # XXX - we are not testing anything
    search = rft.IntrinsicVolumes([3,4,5])
    x = np.linspace(0.1,10,100)
    stats = [rft.Gaussian()]
    for dfn in range(5,10):
        for dfd in [40,50,np.inf]:
            stats.append(rft.FStat(dfn=dfn, dfd=dfd))
            stats.append(rft.TStat(dfd=dfd))
        stats.append(rft.ChiSquared(dfn=dfn))
    for dim in range(7):
        for stat in stats:
            # XXX - v1 appears to be unused
            v1 = stat(x, search=search)
            v2 = 0
            for i in range(search.mu.shape[0]):
                v2 += stat.density(x, i) * search.mu[i]
Example #40
dataT89 = []
for i in range(nQual):
    dataT89.append(results[i]['Lgm_B_T89']['time'])
dataT89 = array(dataT89)
dataOP77 = []
for i in range(nQual):
    dataOP77.append(results[i]['Lgm_B_OP77']['time'])
dataOP77 = array(dataOP77)

data = hstack([dataT89.transpose(), dataOP77.transpose()])
data = hstack([data[0:,0::2], data[0:,1::2]]) # T89 and OP77 alternate now

# compute statistics on if the medians are different in the two runs
stats = []
for val in range(nQual):
    # for each pair of data do the test
    stats.append(scipy.stats.mannwhitneyu(data[:,val*2], data[:,val*2+1])[1]*2) # get p value *2 for 2 sided
for i, val in enumerate(stats):
    if val < 0.05:
        stats[i] = 'Different'
    else:
        stats[i] = 'Same'

figure()
boxplot(data, notch=True, positions=range(nQual*2))
ax = gca()
ax.set_xlabel('Quality number')
ax.set_ylabel('Run time')
ax.set_title(socket.gethostname() + ' LstarVersusPA Calcs ' +
             str(datetime.datetime.now().month) + '-' +
             str(datetime.datetime.now().day) +
             '-' + str(datetime.datetime.now().year))
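# The *2 doubling above assumes the old SciPy default of a one-sided p-value; newer
# SciPy versions can return the two-sided value directly:
import numpy as np
import scipy.stats

a, b = np.random.normal(0, 1, 30), np.random.normal(0.5, 1, 30)  # toy samples
p = scipy.stats.mannwhitneyu(a, b, alternative='two-sided').pvalue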
Example #41
 def getStats(self):
   '''Gets statistics of the recovered parameters'''
   stats = []
   for idx in arange(len(self.recparams)):
     data = self.chain[idx]
     stats.append([min(data), max(data), mean(data), stdev(data),
                   getConfidenceIntervals(hist(data, 1000)[0],
                                          linspace(min(data), max(data), 1000),
                                          [68.0, 95.0, 99.0])])
            c_name = "%s_%s" % (clust, cl)
            clust_col_names.append(c_name)
            coords = clust_coords[clust][cl]
            for ss in subj_list:
                lg.info("get peak for Clust %s, subj %s" % (clust, ss))
                L.append(mask_dump_peak(clust, coords, ss))
            clust_dat = clust_dat.append(pd.Series(L))

    out_dat = pd.DataFrame(clust_dat.reshape(7, 17).T, columns=clust_col_names)
    outname = os.path.join(stdoutdir, "peak_voxel_data.csv")
    out_dat.to_csv(outname, index=False)

lg.info(out_dat.corr())


# Doing the tests
import itertools
import scipy.stats

conds = []
stats = []
for combo in itertools.combinations(range(out_dat.shape[1]), 2):
    conds.append(list([out_dat.columns[combo[0]], out_dat.columns[combo[1]]]))
    stats.append(list(scipy.stats.pearsonr(out_dat.iloc[:, combo[0]], out_dat.iloc[:, combo[1]])))

corr_res = pd.concat([pd.DataFrame(conds), pd.DataFrame(stats)], axis=1)
col_heads = ["condition1", "condition2", "rvalue", "pvalue"]
corr_res.columns = col_heads
outname_corr = os.path.join(stdoutdir, "corr_tests_on_peak_voxel_data.csv")
corr_res.to_csv(outname_corr, index=False)
Example #43
def plotPhaseTransition_d05(descDic, percentile=25, plot=False):
    
    resultDir = "/Users/maithoma/work/compute/pgames_d05_transition/results/"
    #print listRootFilenames()
    
    rt_fnames = selectRootFilenames(descDic)['list_rt']
    rt_variables = selectRootFilenames(descDic)['var_dic']
    
    S = []
    C = []
    s = []
    cMedian = []
    cDown = []
    cUp = []
    cCountUp = []
    cCountDown = []
    cCountMiddle = []
    
    for r,rt in enumerate(rt_fnames):
        i=0
        c = []
        s.append(rt_variables['s'][r])
        while True:
            try:
                filename = rt + "_%s.csv"%i
                #print filename
                coop = parseSummary(filename)['coop_level'][-1]
                #print filename,coop
                c = np.append(c,coop)
                S.append(rt_variables['s'][r])
                C.append(coop)
                i+=1
            except IOError:
                break
        print(rt, c)
        cMedian.append(np.mean(c))
        cDown.append(np.percentile(c,percentile))
        cUp.append(np.percentile(c,100 - percentile))
        cCountUp.append(len(c[c>0.8])/float(len(c)))
        cCountDown.append(len(c[c<0.2])/float(len(c)))
        cCountMiddle.append(len(c[(c>=0.2)*(c<=0.8)])/float(len(c)))
        
    dic = {'s': s, 'cMedian': cMedian, 'cUp': cUp, 'cDown': cDown, 'C': C, 'S': S,
           'cCountDown': cCountDown, 'cCountUp': cCountUp, 'cCountMiddle': cCountMiddle}
    
    if plot:
        pl.close("all")
        pl.figure(1)
        
        #pl.plot(dic['s'],dic['cCountDown'],'r-+',lw=1)
        #pl.plot(dic['s'],dic['cCountUp'],'g-x',lw=1)
        #pl.plot(dic['s'],dic['cCountMiddle'],'b-')
        pl.xlabel("Property Violation s")
        #pl.ylabel("Probability that cooperation wins (green) or disappears (red), \n or intermediary state (blue)")
        pl.ylim(-0.05,1.05)
        #pl.xlim(xmax=0.05)
        
        pl.plot(dic['s'],dic['cMedian'],'k-.',lw=2)
        pl.fill_between(dic['s'],dic['cDown'],dic['cUp'],color='k',alpha=0.2)
        #pl.plot(dic['s'],dic['cUP'],'k-.',lw=2)
        #pl.plot(dic['s'],dic['cDown'],'k-.',lw=2)
        pl.plot(S,C,'ko')
        pl.xlabel("Property Violation s")
        pl.ylabel("Median Cooperation Level")
        

        

    return dic
Example #44
def ttest(dsets, sa_labels=None, return_values='mt',
          set_NaN_to=0., compare_to=0.):
    '''Runs a one-sample t-test across datasets

    Parameters
    ----------
    dsets: str or list of dicts
        (filenames of) NIML dsets, each referring to PxQ data for
        P nodes (features) and Q values per node (samples)
    sa_labels: list of (int or str)
        indices or labels of columns to compare
    return_values: str (default: 'mt')
        'm' or 't' or 'mt' to return sample mean, t-value, or both
    set_NaN_to: float or None (default: 0.)
        the value that NaNs in dsets replaced by. If None then NaNs are kept.
    compare_to: float (default: 0.)
        t-tests are compared against the null hypothesis of a mean of
        compare_to.

    Returns
    -------
    dset: dict
        NIML dset-compatible dict with fields 'data', 'labels',
        'stats' and 'node_indices' set.
    '''

    do_m = 'm' in return_values
    do_t = 't' in return_values

    if not (do_m or do_t):
        raise ValueError("Have to return at least m or t")

    ns = len(dsets)

    for i, dset in enumerate(dsets):
        dset = from_any(dset)
        dset_data = dset['data']
        if i == 0:
            sh = dset_data.shape
            if sa_labels is None:
                if 'labels' in dset:
                    sa_labels = dset['labels']
                    dset_labels = sa_labels
                else:
                    sa_labels = range(sh[1])
                    dset_labels = ['%d' % j for j in sa_labels]
            else:
                dset_labels = sa_labels


            nc = len(dset_labels) if dset_labels else sh[1]
            nn = sh[0]

            data = np.zeros((nn, nc, ns), dset_data.dtype) # number of nodes, columns, subjects

        if 'node_indices' in dset:
            node_idxs = np.reshape(dset['node_indices'], (-1,))
        else:
            node_idxs = np.arange(nn)

        if i == 0:
            node_idxs0 = node_idxs
        else:
            if set(node_idxs0) != set(node_idxs):
                raise ValueError("non-matching node indices for %d and %d" %
                                    (0, i))

        col_idxs = np.asarray(label2index(dset, sa_labels))

        data[node_idxs, :, i] = dset_data[:, col_idxs]

    # subtract the value it is compared to
    # so that it now tests against a mean of zero
    if do_m:
        m = np.mean(data, axis=2)

    if do_t:
        from scipy import stats
        t = stats.ttest_1samp(data - compare_to, 0., axis=2)[0]

    if do_m and do_t:
        r = np.zeros((nn, 2 * nc), dtype=m.dtype)
        r[:, np.arange(0, 2 * nc, 2)] = m
        r[:, np.arange(1, 2 * nc, 2)] = t
    elif do_t:
        r = t
    elif do_m:
        r = m

    pf = []
    stats = []
    if do_m:
        pf.append('m')
        stats.append('None')
    if do_t:
        pf.append('t')
        stats.append('Ttest(%d)' % (ns - 1))

    labs = sum([['%s_%s' % (p, lab) for p in pf] for lab in dset_labels], [])
    stats = stats * nc

    if set_NaN_to is not None:
        r[np.logical_not(np.isfinite(r))] = set_NaN_to


    return dict(data=r, labels=labs, stats=stats, node_indices=node_idxs0)
Example #45
def build_ffnet(sorted_data,training_set_size):

	logging.info('starting new run! -----------------------------')
	print('defining network')

	from ffnet import ffnet, imlgraph, mlgraph, loadnet, savenet
	from time import time
	from multiprocessing import cpu_count
	import networkx
	import pylab

	#data_in_training2 = sorted_data[:training_set_size,10:-2].astype(float).tolist()
	data_target_training2 = [[i] for i in sorted_data[:training_set_size,0].astype(float)]

	new_data_in = sorted_data[:training_set_size,col_training_set[0]]
	for i in col_training_set[1:]:
		new_data_in = numpy.column_stack((new_data_in, sorted_data[:training_set_size,i]))
	data_in_training2 = new_data_in.astype(float).tolist()

	# Define net (large one)
	conec = mlgraph(network_config, biases=False) #skipping first 11 cols
	net = ffnet(conec)
	print('saving initialized net')
	savenet(net, 'starting_net.n')
	#net = loadnet('starting_net.n') # this way we can init a complex net just once

	#print 'draw network'
	#networkx.draw_graphviz(net.graph, prog='dot')
	#pylab.show()

	graph_weekly(net, sorted_data,training_set_size) # just saving a pic

	logging.info('network built as: ' + str(network_config) )

	print "TRAINING NETWORK..."
	# that are many different training algos

	#net.train_rprop(data_in_training2, data_target_training2, a=1.9, b=0.1, mimin=1e-06, mimax=15.0, xmi=0.5, maxiter=max_functions, disp=1)
	###net.train_momentum(data_in_training2, data_target_training2, eta=0.2, momentum=0.1, maxiter=max_functions, disp=1)
	stats = []
	smallest_error = 1000
	total = 0
	try:
		for i in range(min_loops, max_loops):
			total += max_functions+i
			if total > max_total:
				break
			print('training for:', max_functions+i, "total is:", total)

			net.train_tnc(data_in_training2, data_target_training2, maxfun = max_functions+i, messages=1)
			#net.train_rprop(data_in_training2, data_target_training2, a=1.2, b=0.5, mimin=1e-06, mimax=50.0, xmi=0.1, maxiter=max_functions*20, disp=1)

			graph_weekly(net, sorted_data,training_set_size) # just saving a pic

			in0, out0, s1, s2, mape_weekly_all = calc_stats(net,sorted_data,training_set_size)
			stats.append((in0, out0,total, s1, s2, mape_weekly_all))
			#if out0<=(biggest/1.4) and in0>.7:
			#if out0<=(smallest_error/4) and in0>overfitting_threshold:
			#	print 'we hit overfitting threshold - breaking out early'
			#	break
			if mape_weekly_all < smallest_error: # found a new best
				smallest_error = mape_weekly_all
				savenet(net, 'best_net.n')
	except KeyboardInterrupt: # this way command-c just breaks out of this loop
		pass


	#net.train_cg(data_in_training2, data_target_training2, maxiter=max_functions, disp=1)
	#net.train_genetic(data_in_training2, data_target_training2, individuals=max_population, generations=max_functions)
	#net.train_bfgs(data_in_training2, data_target_training2, maxfun = max_functions, disp=1)
	stats = sorted(stats, reverse=True, key=lambda x: x[1])
	for i in stats:
		temp_string = ''
		for x in i:
			temp_string += str(x) + ','
		print(temp_string)

	net = loadnet('best_net.n')
	return net
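# For context, the ffnet calls above (mlgraph, ffnet, train_tnc, savenet/loadnet)
# follow the library's documented pattern. A minimal standalone run, based on
# ffnet's canonical XOR example (a sketch; ffnet is a Python 2-era package):
from ffnet import ffnet, mlgraph

conec = mlgraph((2, 2, 1))  # 2 inputs, 2 hidden units, 1 output
net = ffnet(conec)
inp = [[0., 0.], [0., 1.], [1., 0.], [1., 1.]]
trg = [[0.], [1.], [1.], [0.]]  # XOR targets
net.train_tnc(inp, trg, maxfun=1000)
print(net([1., 0.]))  # should be close to 1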