Example #1
def xanes_fitting_Line(im_stack, e_list, refs, method="NNLS", alphaForLM=0.05):
    """Linear combination fit of image data with reference standards"""
    en, im1, im2 = np.shape(im_stack)
    im_array = np.mean(im_stack, 2)
    coeffs_arr = []
    meanStats = {
        "R_Factor": 0,
        "R_Square": 0,
        "Chi_Square": 0,
        "Reduced Chi_Square": 0
    }

    for i in range(im1):
        stats, coeffs = xanes_fitting_1D(im_array[:, i],
                                         e_list,
                                         refs,
                                         method=method,
                                         alphaForLM=alphaForLM)
        coeffs_arr.append(coeffs)
        for key in stats.keys():
            meanStats[key] += stats[key]

    for key, vals in meanStats.items():
        meanStats[key] = np.around((vals / im1), 5)

    return meanStats, np.mean(coeffs_arr, axis=0)
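The helper xanes_fitting_1D used above is not included in this listing. As a rough sketch of what a non-negative linear-combination fit of one spectrum against reference standards can look like (an assumed implementation, not the project's actual code), using scipy.optimize.nnls:

# Hypothetical sketch only: the real xanes_fitting_1D is not shown in this listing.
import numpy as np
from scipy.optimize import nnls

def lc_fit_1d_sketch(spectrum, refs):
    """Fit one spectrum as a non-negative combination of reference columns."""
    A = np.asarray(refs)        # shape (n_energies, n_refs)
    y = np.asarray(spectrum)    # shape (n_energies,)
    coeffs, _ = nnls(A, y)      # non-negative least squares
    residual = y - A @ coeffs
    stats = {
        "R_Factor": np.sum(np.abs(residual)) / np.sum(np.abs(y)),
        "R_Square": 1 - np.sum(residual ** 2) / np.sum((y - y.mean()) ** 2),
        "Chi_Square": np.sum(residual ** 2),
        "Reduced Chi_Square": np.sum(residual ** 2) / max(len(y) - len(coeffs), 1),
    }
    return stats, coeffs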
Example #2
	def reportStats(self,stats,name,sortby,n):
		# format stats defined in 'stats' dictionary into an output string. 'sortby' specifies how 
		# the output should be ranked. 'n' is number of entries to print (specify an integer or 'all')
		sortbyDict   = self.getStatsForList([1,1])
		headerOrder  = sortbyDict['returnOrder']
		headerString = '{}\t\t{}\t\t{}\t\t{}\t\t{}\t\t{}\t\t{}\n'.format(name,*headerOrder)
		if sortby not in sortbyDict.keys(): return 'Unexpected ranking specified'
		
		list1,list2 = [],[]
		for key in stats.keys():
			statsDic = stats[key]
			statsFmtd = []		
			for key2 in headerOrder: 
				if isinstance(statsDic[key2],float): statsFmtd.append('{0:.3f}'.format(statsDic[key2]))
				else: statsFmtd.append(str(statsDic[key2]))
			string = '\t\t'.join([str(key)]+statsFmtd)
			list1.append(string)
			list2.append(statsDic[sortby])

		sortedList1 = [x for (y,x) in sorted(zip(list2,list1))] # sort by chosen 'sortby' parameter
		if n != 'all':
			stringOut =	'\n'.join(sortedList1[:n])	
		else:
			stringOut =	'\n'.join(sortedList1)	
		return headerString+stringOut
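The ranking above relies on Python's tuple sort over zip(list2, list1): rows are ordered by the numeric value of the chosen 'sortby' statistic. A toy illustration of the same pattern:

# Toy illustration of the sorted(zip(...)) ranking pattern used in reportStats.
rows = ['atomA\t\t0.120', 'atomB\t\t0.030', 'atomC\t\t0.450']
keys = [0.120, 0.030, 0.450]
ranked = [row for (key, row) in sorted(zip(keys, rows))]
print('\n'.join(ranked))   # atomB first (smallest key), atomC last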
Example #3
def grid_search_own_metrics_class_stratified(ix, iy, stratus_list,
                                             clf, get_grid_hyperparams_kargs,
                                             regressor_name="Regressor",
                                             show_progress_percentage=0.1,
                                             kfold=5, shuffle=True,
                                             sort_report_by='roc_auc_score'):
    clfks = get_grid_hyperparams(**get_grid_hyperparams_kargs)
    report_data = []
    hpks = list(clfks[0].keys())
    cols = len(clfks)
    progress_int = max(int(round(cols*show_progress_percentage,0)),1)
    print("Total number of evaluations:{}".format(cols))
    for col,clfk in enumerate(clfks):
        metrics = cv_metrics_stratified_class(ix, iy.flatten(), stratus_list=stratus_list,
                                              clf=clf, clfk=clfk, kfold=kfold,shuffle=shuffle)
        metrics_report = {'name':regressor_name}
        for m in metrics.keys():
            stats = metrics_stats(metrics[m],rn=3)
            for sk in stats.keys():
                metrics_report[m+"_"+sk]=stats[sk]
        metrics_report.update(clfk)
        report_data.append(metrics_report.copy())
        if col%progress_int==0:
            progress = round(100*col/cols,0)
            print("{} %".format(progress))
    print("100.0 %")
    odf = pd.DataFrame(report_data[:])
    odf = odf.sort_values(by=[sort_report_by+"_mean"],ascending=False)
    mean_cols = list(filter(lambda x: "mean" in x,list(odf.columns)))
    ocols = ['name']
    ocols.extend(hpks)
    ocols.extend(mean_cols)
    ocols.extend(list(filter(lambda x: x not in ocols,odf.columns)))
    odf = odf[ocols].reset_index(drop=True)
    return odf.copy()
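get_grid_hyperparams is not shown in this listing; presumably it expands a grid specification into a list of hyperparameter dicts, one per combination. A minimal sketch of that behaviour (assumed, not the actual helper):

# Assumed sketch of a grid expansion helper; the real get_grid_hyperparams is not shown.
from itertools import product

def get_grid_hyperparams_sketch(param_grid):
    keys = list(param_grid.keys())
    return [dict(zip(keys, values))
            for values in product(*(param_grid[k] for k in keys))]

# e.g. 2 x 2 = 4 combinations for a hypothetical classifier
print(get_grid_hyperparams_sketch({'C': [0.1, 1.0], 'penalty': ['l1', 'l2']}))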
Example #4
def cv_metrics_df_with_indexes(X,
                               Y,
                               train_indexes,
                               test_indexes,
                               iclf,
                               iclfk={},
                               report_metrics=[
                                   'matthews_corr_coef', 'roc_auc_score',
                                   'f1_score', 'sensitivity', 'specificity'
                               ],
                               norm=False,
                               calc_stats=True,
                               report_name='CLF',
                               sort_metric='matthews_corr_coef_min'):
    output_objs = {}
    output_metrics = {}
    stats_df = []
    report_name_sufix = ''
    total_features = X.shape[-1]
    for train_index, test_index in zip(train_indexes, test_indexes):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        if len(y_test.shape) > 1:
            y_test = y_test.argmax(1)
        if norm == True:
            X_train, X_test = norm_z_score(X_train, X_test)
        start_feature_number = 1
        end_feature_number = total_features + 1
        for feature_number in range(start_feature_number, end_feature_number):
            tmp_scores = output_metrics.get('F' + str(feature_number), {})
            X_train_ = X_train[:, :feature_number]
            X_test_ = X_test[:, :feature_number]
            tmp_scores = fit_and_get_metrics(X_train_, X_test_, y_train,
                                             y_test, iclf, iclfk,
                                             report_metrics, tmp_scores)
            output_metrics['F' + str(feature_number)] = tmp_scores
    if calc_stats == True:
        number_of_features = 1
        for fn in range(number_of_features, total_features + 1):
            fk = 'F' + str(fn)
            metrics = output_metrics[fk]
            metrics_report = {
                'Name': report_name + report_name_sufix,
                'Number of Variables': fn
            }
            for m in metrics.keys():
                stats = metrics_stats(metrics[m], rn=3)
                for sk in stats.keys():
                    metrics_report[m + "_" + sk] = stats[sk]
            stats_df.append(metrics_report)
        stats_df = pd.DataFrame(stats_df).sort_values(
            by=[sort_metric, 'Number of Variables'],
            ascending=[False, True]).reset_index(drop=True)
    return output_metrics.copy(), stats_df.copy()
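norm_z_score is not included in this listing. A common implementation standardizes both folds with statistics computed on the training fold only, so no test information leaks into the scaling; a minimal sketch under that assumption:

# Assumed sketch; the actual norm_z_score used above is not shown here.
import numpy as np

def norm_z_score_sketch(X_train, X_test, eps=1e-12):
    mu = X_train.mean(axis=0)
    sigma = X_train.std(axis=0) + eps   # eps guards against zero variance
    return (X_train - mu) / sigma, (X_test - mu) / sigma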
Example #5
def loadAnalyserResults(yearList,
                        suffix='noStopLoss_partial_360_noOutlier',
                        pricePerc=True):
    '''
    Takes a list of years as input and loads the
    corresponding result files.
    '''

    yearStats = {}
    # suffix = 'min_1_stopLoss_20bp_full_360'
    # suffix = 'noStopLoss_partial_360'
    for year in yearList:
        print('Currently at year:', year)
        with open(
                'results/' + suffix + '/' + 'gapStatsDf_' + str(year) + '_' +
                suffix + '.pickle', 'rb') as handle:
            yearStats[year] = pickle.load(handle)

    # Preprocessing to add desired stats
    for year in yearList:
        stockList = yearStats[year].keys()
        for stock in stockList:
            yearStats[year][stock]['prevLow'] = yearStats[year][stock][
                'currLow'].shift(1)
            yearStats[year][stock]['prevHigh'] = yearStats[year][stock][
                'currHigh'].shift(1)
            yearStats[year][stock]['prevOpen'] = yearStats[year][stock][
                'currOpen'].shift(1)
            yearStats[year][stock].dropna(axis=0, how='any', inplace=True)

    stats = yearStats[yearList[0]]
    stockList = stats.keys()
    for year in yearList[1:]:
        for stock in stockList:
            stats[stock] = stats[stock].append(yearStats[year][stock])

    grandStats = pd.DataFrame()
    for stock in stockList:
        tmpDf = pd.DataFrame()
        tmpDf = tmpDf.append(stats[stock])
        tmpDf['stockName'] = stock

        if (pricePerc):
            tmpDf['pricePerc'] = (tmpDf['currOpen']).rolling(window=20).apply(
                lambda x: pd.Series(x).rank(pct=True).iloc[-1])
            tmpDf['pricePerc'] *= 100.0
            tmpDf.dropna(inplace=True)

        grandStats = grandStats.append(deepcopy(tmpDf))

    print('Loading results complete!')

    return grandStats
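The 'pricePerc' column is a rolling percentile rank: each open price is ranked, as a percentage, within its trailing 20-observation window. A self-contained sketch of just that step, on toy data:

# Self-contained sketch of the rolling percentile-rank used for 'pricePerc'.
import numpy as np
import pandas as pd

prices = pd.Series(np.random.default_rng(0).normal(100, 5, 60))
price_perc = prices.rolling(window=20).apply(
    lambda x: pd.Series(x).rank(pct=True).iloc[-1]) * 100.0
print(price_perc.dropna().head())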
Example #6
def readStatsDafsComputeStandardizationBins(statAndDafFileName,
                                            nBins=50,
                                            pMisPol=0.0):
    stats = {}
    dafs = []
    pMisPolInv = 1 - pMisPol
    misPolarizedSnps, totalSnps = 0, 0
    with open(statAndDafFileName) as statAndDafFile:
        first = True
        for line in statAndDafFile:
            line = line.strip().split()
            if first:
                first = False
                header = line
                assert header[0] == "daf"
                for i in range(1, len(header)):
                    stats[header[i]] = []
            else:
                totalSnps += 1
                if random.random() >= pMisPolInv:
                    dafs.append(1 - float(line[0]))
                    misPolarizedSnps += 1
                else:
                    dafs.append(float(line[0]))
                for i in range(1, len(line)):
                    stats[header[i]].append(float(line[i]))

    statInfo = {}
    for statName in stats.keys():
        stats[statName] = np.array(stats[statName])
        nonan = ~np.isnan(stats[statName])
        score_nonan = stats[statName][nonan]
        daf_nonan = np.array(dafs)[nonan]
        bins = allel.stats.selection.make_similar_sized_bins(daf_nonan, nBins)
        mean_score, _, _ = scipy.stats.binned_statistic(daf_nonan,
                                                        score_nonan,
                                                        statistic=np.mean,
                                                        bins=bins)
        std_score, _, _ = scipy.stats.binned_statistic(daf_nonan,
                                                       score_nonan,
                                                       statistic=np.std,
                                                       bins=bins)
        statInfo[statName] = (mean_score, std_score, bins)
        sys.stderr.write("mispolarized %d of %d (%f%%) "
                         "SNPs when standardizing scores in %s\n" % (
                             misPolarizedSnps,
                             totalSnps,
                             100 * misPolarizedSnps / float(totalSnps),
                             statAndDafFileName,
                         ))
    return statInfo
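The standardization data produced above pairs each statistic with per-frequency-bin means and standard deviations. A self-contained sketch of the same idea, with quantile edges standing in for allel's make_similar_sized_bins:

# Sketch of per-frequency-bin score standardization on toy data
# (quantile bins approximate allel.stats.selection.make_similar_sized_bins).
import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
daf = rng.uniform(0, 1, 1000)
score = 2 * daf + rng.normal(0, 0.1, 1000)

bins = np.quantile(daf, np.linspace(0, 1, 11))    # ~equal-count bin edges
mean_score, _, _ = scipy.stats.binned_statistic(daf, score, statistic=np.mean, bins=bins)
std_score, _, _ = scipy.stats.binned_statistic(daf, score, statistic=np.std, bins=bins)
which_bin = np.clip(np.digitize(daf, bins) - 1, 0, len(bins) - 2)
standardized = (score - mean_score[which_bin]) / std_score[which_bin]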
Example #7
def _make_stats_tables(stats):
    name_col = [
        k for k in stats.keys() if k != 'intervals' and stats[k] is not None
    ]
    value_col = [
        round(float(stats[k]), 3) for k in name_col if stats[k] is not None
    ]

    interval_names = list(stats['intervals'].keys())
    interval_values = [
        round(float(i[1]), 1) for i in stats['intervals'].values()
    ]

    data = dict(
        names=name_col,
        values=value_col,
    )
    source = ColumnDataSource(data)

    columns = [
        TableColumn(field="names", title="Name"),
        TableColumn(field="values", title="Value"),
    ]
    data_table = DataTable(source=source,
                           columns=columns,
                           width=150,
                           fit_columns=True,
                           index_position=None)

    int_data = dict(
        names=interval_names,
        values=interval_values,
    )
    int_source = ColumnDataSource(int_data)

    int_columns = [
        TableColumn(field="names", title="interval"),
        TableColumn(field="values", title="Max Value"),
    ]

    interval_table = DataTable(source=int_source,
                               columns=int_columns,
                               width=120,
                               fit_columns=True,
                               index_position=None)

    return data_table, interval_table
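A hypothetical usage of the helper above, assuming Bokeh is available (with ColumnDataSource, TableColumn and DataTable imported from bokeh.models) and that the stats dict holds scalar entries plus an 'intervals' dict of 2-tuples whose second element is the max value:

# Hypothetical usage sketch; field names and values are made up for illustration.
from bokeh.io import save
from bokeh.layouts import row

stats = {
    'mean_depth': 31.4,
    'pct_mapped': 98.7,
    'intervals': {'exon1': (0, 120.0), 'exon2': (0, 88.0)},
}
data_table, interval_table = _make_stats_tables(stats)
save(row(data_table, interval_table), filename='stats_tables.html')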
Example #8
def xanes_fitting_Binned(im_stack,
                         e_list,
                         refs,
                         method="NNLS",
                         alphaForLM=0.05):
    """Linear combination fit of image data with reference standards"""

    im_stack = resize_stack(im_stack, scaling_factor=10)
    # use a simple filter to find threshold value
    val = filters.threshold_otsu(im_stack[-1])
    en, im1, im2 = np.shape(im_stack)
    im_array = im_stack.reshape(en, im1 * im2)
    coeffs_arr = []
    meanStats = {
        "R_Factor": 0,
        "R_Square": 0,
        "Chi_Square": 0,
        "Reduced Chi_Square": 0
    }

    specs_fitted = 0
    total_spec = im1 * im2
    for i in range(total_spec):
        spec = im_array[:, i]
        # do not fit low intensity/background regions
        if spec[-1] > val:
            specs_fitted += 1
            stats, coeffs = xanes_fitting_1D(spec / spec[-1],
                                             e_list,
                                             refs,
                                             method=method,
                                             alphaForLM=alphaForLM)
            coeffs_arr.append(coeffs)
            for key in stats.keys():
                meanStats[key] += stats[key]
        else:
            pass

    for key, vals in meanStats.items():
        meanStats[key] = np.around((vals / specs_fitted), 6)
    # print(f"{specs_fitted}/{total_spec}")
    return meanStats, np.mean(coeffs_arr, axis=0)
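The Otsu threshold computed from the last energy frame is what separates background from pixels worth fitting. A minimal, self-contained sketch of that masking step on a toy stack:

# Minimal sketch of the Otsu-threshold masking step on a toy image stack.
import numpy as np
from skimage import filters

stack = np.random.default_rng(0).random((10, 64, 64))   # (energies, y, x)
val = filters.threshold_otsu(stack[-1])
foreground = stack[-1] > val
print(f"{foreground.sum()} of {foreground.size} pixels would be fitted")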
Example #9
def concatenate_feature_vectors(eeg,stats):
	if eeg.keys() != stats.keys():
		raise ValueError("Input dictionaries have different keys")
	output_structure = {}
	for subset_name in eeg.keys():
		if eeg[subset_name].keys() != stats[subset_name].keys():
			raise ValueError("Input dictionaries have different keys")
		output_structure.update({subset_name : {}})
		for class_name in eeg[subset_name].keys():
			if len(eeg[subset_name][class_name]) != len(stats[subset_name][class_name]):
				raise ValueError("Numbers of frames in dicts are different")
			output_structure[subset_name].update({class_name : []})
			for i in range(len(eeg[subset_name][class_name])):
				frame1 = eeg[subset_name][class_name][i] 
				frame2 = stats[subset_name][class_name][i] 
				if len(frame1.shape) != 1 or len(frame2.shape) != 1:
					raise ValueError("Frame dimensionality not equal to 1")
				newframe = np.concatenate([frame1, frame2])
				output_structure[subset_name][class_name].append(newframe)
	return output_structure
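A toy usage of concatenate_feature_vectors, assuming the function above (and numpy as np) is in scope: two dictionaries with identical subset/class structure whose 1-D frames are concatenated element-wise.

# Toy usage sketch; the input dictionaries are fabricated for illustration.
import numpy as np

eeg = {'train': {'rest': [np.array([1.0, 2.0])]}}
stats = {'train': {'rest': [np.array([0.5])]}}
combined = concatenate_feature_vectors(eeg, stats)
print(combined['train']['rest'][0])   # [1.  2.  0.5]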
Example #10
def cv_metrics_stratified_class_report_with_indexes(
        X,
        Y,
        indexes,
        clf,
        clfk={},
        kfold=5,
        shuffle=True,
        report_metrics=[
            'matthews_corr_coef', 'roc_auc_score', 'f1_score', 'sensitivity',
            'specificity'
        ],
        regressor_name='Regressor',
        sort_report_by='roc_auc_score',
        norm=False):
    report_data = []
    metrics = cv_metrics_stratified_class_with_indexes(
        X,
        Y,
        indexes=indexes,
        clf=clf,
        clfk=clfk,
        report_metrics=report_metrics,
        norm=norm)
    metrics_report = {'name': regressor_name}
    for m in metrics.keys():
        stats = metrics_stats(metrics[m], rn=3)
        for sk in stats.keys():
            metrics_report[m + "_" + sk] = stats[sk]
    metrics_report.update(clfk)
    report_data.append(metrics_report.copy())
    odf = pd.DataFrame(report_data[:])
    odf = odf.sort_values(by=[sort_report_by + "_mean"], ascending=False)
    mean_cols = list(filter(lambda x: "mean" in x, list(odf.columns)))
    ocols = ['name']
    ocols.extend(clfk)
    ocols.extend(mean_cols)
    ocols.extend(list(filter(lambda x: x not in ocols, odf.columns)))
    odf = odf[ocols].reset_index(drop=True)
    return odf.copy()
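metrics_stats is not shown in this listing; the report columns such as roc_auc_score_mean and matthews_corr_coef_min suggest it summarizes each metric's per-fold values. A plausible sketch under that assumption:

# Assumed sketch of a metric summary helper; the real metrics_stats is not shown.
import numpy as np

def metrics_stats_sketch(values, rn=3):
    values = np.asarray(values, dtype=float)
    return {
        'mean': round(float(values.mean()), rn),
        'std': round(float(values.std()), rn),
        'min': round(float(values.min()), rn),
        'max': round(float(values.max()), rn),
    }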
Example #11
def write_stats(eff, stats, filename, titles, names, all_names, corp_names, process_time, write_time, db_update_obj):
    file = filename + "-stats.tsv"
    if not os.path.exists("Webapp/Files/results/%s/Stats" %(filename)):
        os.makedirs("Webapp/Files/results/%s/Stats" %(filename))
    file_path = os.path.join("Webapp/Files/results/%s/Stats" %(filename), file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except Exception as e:
        print(e)
    with open(file_path, "w+") as stat:
        stat.write(file + " was processed in " + str(process_time) + " write back to file in " + str(write_time) +"\n\n")
        stat.write(str(all_names) + " names were extracted from " +filename + "\n" + str(names) + " unique names " + " --- " + str(names - int(corp_names)) + " Personal names and " + str(corp_names) + " Corporate names" + "\n")
        stat.write(str(titles) + " titles were extracted from " + filename + "\n\n")
        stat.write("API searched" +"\t" + "hits" + "\t" + "hit_rate" +"\n")
        for i in stats.keys():
            setattr(db_update_obj, i, str(stats[i]))
            db_update_obj.save()
            stat.write(i + "\t" + str(stats[i]) + "\t" + str((int(stats[i])/names)*100) + "\n")
        stat.write("\n" + "\n")
        if 'LC' in eff.keys():
            stat.write('LC_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["LC"]:
                stat.write(str(i) + "\t")
            stat.write("\n" + "\n")
        if 'VIAF' in eff.keys():
            stat.write('VIAF_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["VIAF"]:
                stat.write(str(i) + "\t")
            stat.write("\n" + "\n")
        if 'work_id' in eff.keys():
            stat.write('work_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["work_id"]:
                stat.write(str(i) + "\t")
            stat.write("\n" + "\n")
        if 'oclcid' in eff.keys():
            stat.write('oclc_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["oclcid"]:
                stat.write(str(i) + "\t")
Example #12
def detect(imagefilename,threshold=0.5,prior=-1.0):
    paramsdir = '../../data/params/'
    mergepairs = True
    stats = pickle.load(open('%snormalbayes-detectionoutput.pkl' % (paramsdir),'rb'))

    if prior>-1.0:
        stats['posnegratio'] = prior

    # load descriptors from feature file
    patchstep = 25
    patchsize = 30
    #featuretype = 'surf_%d_%d' % (patchstep,patchsize)

    data = {}
    features = []
    nfeatures = 0
    for feature in stats.keys():
        if feature != 'posnegratio':
            descriptorfile = '%s%s_%d_%d' % (imagefilename[:-3],feature,patchstep,patchsize)
            data[feature] = numpy.genfromtxt(descriptorfile)
            features.append(feature)

    # for each descriptor, determine whether there is a match or not
    matches = []
    for i in range(data[features[0]].shape[0]):
        xkey = data[features[0]][i,0]
        ykey = data[features[0]][i,1]
        descriptor = {}
        for feature in features:
            descriptor[feature] = data[feature][i,3:]
        p = matchprobability(descriptor,stats,features)
        if p>threshold:
            if (not mergepairs) or ((xkey-patchstep,ykey) not in matches) and ((xkey,ykey-patchstep) not in matches):
                matches.append((xkey,ykey))


    return matches
Example #13
def write_stats(eff, stats, filename, titles, names, all_names, corp_names, process_time, write_time):
    file = filename + "-stats.tsv"
    if not os.path.exists("results/%s/Stats" %(filename)):
        os.makedirs("results/%s/Stats" %(filename))
    file_path = os.path.join("results/%s/Stats" %(filename), file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except Exception as e:
        print(e)
    with open(file_path, "w+") as stat:
        stat.write(file + " was processed in " + str(process_time) + " write back to file in " + str(write_time) +"\n\n")
        stat.write(str(all_names) + " names were extracted from " +filename + "\n" + str(names) + " unique names " + " --- " + str(names - int(corp_names)) + " Personal names and " + str(corp_names) + " Corporate names" + "\n")
        stat.write(str(titles) + " titles were extracted from " + filename + "\n\n")
        stat.write("API searched" +"\t" + "hits" + "\t" + "hit_rate" +"\n")
        for i in stats.keys():
            stat.write(i + "\t" + str(stats[i]) + "\t" + str((int(stats[i])/names)*100) + "\n")
        stat.write("\n" + "\n")
        if 'LC' in eff.keys():
            stat.write('LC_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["LC"]:
                stat.write(str(i) + "\t")
            stat.write("\n" + "\n")
        if 'VIAF' in eff.keys():
            stat.write('VIAF_ID' + '\n' + "names enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["VIAF"]:
                stat.write(str(i) + "\t")
            stat.write("\n" + "\n")
        if 'work_id' in eff.keys():
            stat.write('work_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["work_id"]:
                stat.write(str(i) + "\t")
            stat.write("\n" + "\n")
        if 'oclcid' in eff.keys():
            stat.write('oclc_id' + '\n' + "titles enriched" + "\t" + "average matching score" + "\t" + "median matching score" + "\t" + "variance of matching score" + "\t" + "standard-div of matching score" + "\t" + "hit rate" + "\n")
            for i in eff["oclcid"]:
                stat.write(str(i) + "\t")
Example #14
def main(ref_path, pred_path, r_nodata, p_nodata, out_txt, pred_scale=1, ax_limit=None):

    sns.set_style('white')
    sns.set_context(context='paper', rc={'patch.linewidth': 0})
    
    t0 = time.time()
    r_nodata = int(r_nodata)
    p_nodata = int(p_nodata)

    out_dir, basename = os.path.split(out_txt)
    t1 = time.time()
    
    # Find and read reference raster
    if not os.path.exists(ref_path):
        raise RuntimeError('ref_path does not exist: %s' % ref_path)
    ds_r = gdal.Open(ref_path)
    ar_r = ds_r.ReadAsArray()
    tx_r = ds_r.GetGeoTransform()
    prj_r = ds_r.GetProjection()
    ds_r = None
    
    # Find and read pred raster
    if not os.path.exists(pred_path):
        raise RuntimeError('pred_path does not exist: %s' % pred_path)
    ds_p = gdal.Open(pred_path)
    ar_p = ds_p.ReadAsArray()
    tx_p = ds_p.GetGeoTransform()
    prj_p = ds_p.GetProjection()
    ds_p = None
    
    if not (tx_p == tx_r and prj_p == prj_r):
        raise ValueError('Geo transform and/or projection of reference and prediction rasters do not match.')

    mask = (ar_r != r_nodata) & (ar_p != p_nodata)
    #print ar_p.min()
    ar_r = ar_r[mask].astype(np.int32)
    ar_p = ar_p[mask].astype(np.int32)
    if 'ltbiomass' in ref_path:
         ar_r = ar_r * float(pred_scale)
    else:
         ar_p = ar_p * float(pred_scale)
    #import pdb; pdb.set_trace()
    
    # Calc stats
    rmse, rmspe, r2, r, gmfr_a, gmfr_b = calc_stats(ar_r, ar_p)
    stats = {'n_pixels': ar_p.size,
                  'rmse': rmse,
                  'r2': r2,
                  'pearsonr': r,
                  'odr_intercept': gmfr_a,
                  'odr_slope': gmfr_b
                  }
    
    # Make 2D histograms
    xlabel = 'reference'
    ylabel = 'predicted'
    this_bn = '%s_%s_vs_%s.png' % (basename.replace('.txt', ''), os.path.basename(ref_path), os.path.basename(pred_path))
    title = this_bn.replace('_vs_', ' vs ').replace('.png','')
    out_png = os.path.join(out_dir, this_bn)
    ax = histogram_2d(ar_r, ar_p, out_png, hexplot=False, cmap='plasma', xlabel=xlabel, ylabel=ylabel, bins=50)
    ax_limit = max(max(ax.get_xlim(), ax_limit))
    plt.sca(ax)
    
    # Plot GMFR (RMA) regression line
    max_val = max(ar_r.max(), ar_p.max())
    x = np.array([0, max_val + 100])
    y = x * gmfr_b + gmfr_a
    plt.plot(x, y, '-', lw=2, color='k')
    label_text = '$r^2$ = %.3f' %  r2

    plt.suptitle(title)
    plt.title(label_text, fontsize=12)
    
    #set plotting limits. 
    if not ax_limit:
        ax_limit = max_val
    plt.ylim((0, ax_limit))
    plt.xlim((0, ax_limit))
    
    plt.savefig(out_png, dpi=300)
         
    plt.clf()
    
    df_xy = pd.DataFrame({'id': np.arange(ar_r.size),
                          'landtrendr': ar_p.astype(np.int16),
                          'lidar': ar_r.astype(np.int16)
                          })
    df_xy.set_index('id', inplace=True)
    df_xy.to_csv(out_png.replace('.png', '_xy.txt'))
    
    desc = '2D histograms made with the following parameters:\n'
    desc += '\tref_path: %s\n' % ref_path
    desc += '\tpred_path: %s\n' % pred_path
    desc += '\tr_nodata: %s\n' % r_nodata
    desc += '\tp_nodata: %s\n' % p_nodata
    desc += '\tout_txt: %s\n' % out_txt.replace('_stats.txt', '.txt')
    desc += '\tpred_scale: %s\n' % pred_scale
    desc += '\nStats for this comparison:'
    for k in sorted(stats.keys()):
        stat_str = '%s: %s' % (k, stats[k])
        desc += '\n\t' + stat_str
        print(stat_str)
    #desc += '\n\t'.join(['%s: %s' % (k, stats[k]) for k in sorted(stats.keys())])

    createMetadata(sys.argv, out_txt, description=desc)
    
    print('\nText file written to', out_txt)
    print('Total time: %.1f minutes' % ((time.time() - t0)/60))
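The histogram_2d helper used above is not part of this listing. A stand-alone sketch of a reference-vs-prediction density plot built directly with matplotlib:

# Stand-alone sketch of a reference-vs-prediction 2D histogram (toy data).
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
reference = rng.normal(100, 20, 10_000)
predicted = reference + rng.normal(0, 10, 10_000)

plt.hist2d(reference, predicted, bins=50, cmap='plasma')
plt.colorbar(label='pixel count')
plt.xlabel('reference')
plt.ylabel('predicted')
plt.savefig('ref_vs_pred_hist2d.png', dpi=300)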
Example #15
def marc_process(processing_files, apis):
    # process start time
    tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    db_update_obj = P_progress(pid=processing_files)
    db_update_obj.save()
    file = "Webapp/source/%s" % (processing_files.name)
    #convert .mrc to MARC/XML
    Marc_XML = MARC_XML(file)
    files = Marc_XML.convert_marc_xml(db_update_obj)
    tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    file = os.path.join('', files)
    filename = files.replace('.xml', '').replace('Webapp/Files/Processing/',
                                                 '')
    print("processing " + filename)
    ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    log_file = str(filename) + "-error-logs"
    output = str(filename) + "-enhanced.xml"
    clearLogs(log_file, filename)
    query_type = "/authorities/names"
    # extracting names and titles from BIBFRAME
    db_update_obj.stage = "Extracting_names_and_titles"
    db_update_obj.save()
    mergecheck = True
    bib_object = Bibframe(file, log_file, mergecheck)
    transformed = bib_object.convert_bibframe()
    names = bib_object.extract_names(transformed)[0]
    titles = bib_object.extract_names(transformed)[1]
    #getting corp names (for stat report)
    all_names = bib_object.extract_names(transformed)[2]
    corp_names = bib_object.extract_names(transformed)[3]
    print(str(all_names) + " names were extracted from " + filename)
    print(
        str(len(names)) + " unique names were extracted from " + filename +
        " --- " + str(len(names) - corp_names) + " Personal names and " +
        str(corp_names) + " Corporate names")
    print(str(len(titles)) + " titles were extracted from " + filename)
    db_update_obj.all_names = str(len(names))
    db_update_obj.all_titles = str(len(titles))
    db_update_obj.p_names = str(len(names) - corp_names)
    db_update_obj.c_names = str(corp_names)
    db_update_obj.save()
    #dictionaries for storing URIs (names and titles) and stats
    enriched_names = {}
    enriched_titles = {}
    stats = {}
    print("enriching names")
    # iterate over the name dictionary
    db_update_obj.stage = "Enriching_names"
    db_update_obj.save()
    for index, item in enumerate(names.keys()):
        db_update_obj.name_index = index + 1
        db_update_obj.save()
        name = item.split('-_-_-')[0]
        #print(index+1, name)
        enriched_names[item] = []
        for api in apis:
            #check if the stat for the API already exists
            if api in stats.keys():
                pass
            else:
                stats[api] = 0
            # getting the API method
            name_result = APIFactory().get_API(name, query_type, api, log_file)
            # if the results are not empty, append the result to the "enriched_names" dictionary using the API name as key
            if name_result:
                enriched_names[item].append(name_result)
                # add the number of results, to be used later in the stats report
                stats[api] = stats[api] + len(name_result)
    print("enriching titles")
    # iterate over the title dictionary
    db_update_obj.stage = "Enriching_titles"
    db_update_obj.save()
    for index, title in enumerate(titles.keys()):
        db_update_obj.title_index = index + 1
        db_update_obj.save()
        #print(index+1, title)
        for authors in titles[title]['authors']:
            author = authors.split('-_-_-')[0]
            key = str(author) + "-_-_-" + str(title)
            enriched_titles[key] = []
            title_result = APIFactory().get_API(author, title, 'search_OCLC',
                                                log_file)
            if title_result:
                enriched_titles[key].append(title_result)
    # getting rid of unwanted things
    db_update_obj.stage = "Optimization"
    db_update_obj.save()
    name_results = clean_up(enriched_names)
    title_result = clean_up(enriched_titles)
    # get the best URI for each API (highest score) and store it in final_names and final_titles
    result_names_Object = Results(name_results, names, file, 'name', log_file)
    result_names_Object.maximizer()
    final_names = result_names_Object.mapping()
    result_title_Object = Results(title_result, titles, file, 'title',
                                  log_file)
    final_titles = result_title_Object.mapping()
    tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    #write back the URIs to the BIBFRAME file
    db_update_obj.stage = "Writing_to_BIBFRAME"
    db_update_obj.save()
    write(final_names, final_titles, file, output, log_file, filename)
    eff = get_stat(final_names, len(names), final_titles, len(titles),
                   filename)
    stats['names-enriched'] = len(final_names)
    tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(
        tff, '%H:%M:%S')
    file_process_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(
        tfs, '%H:%M:%S')
    write_stats(eff, stats, filename, len(titles), len(names), all_names,
                corp_names, file_process_time, write_time, db_update_obj)
    #removing temp-file.xml
    delete_temp()
    print(filename + " processed in: ", file_process_time,
          " --- writing process :", write_time)
    tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime(
        tps, '%H:%M:%S')
    db_update_obj.stage = "The process was completed in %s" % (process_time)
    db_update_obj.save()
    print("walltime:", process_time)
    add_to_archive(processing_files, db_update_obj, len(final_names),
                   len(final_titles))
Example #16
def write_fasta_by_location(peaks):
    seqs = {}
    stats = {}
    for index, row in peaks.iterrows():
        seqs.setdefault(row['location'], '')
        seqs[row['location']] += '\n>{i}_{g}\n{s}'.format(i=index,
                                                          g=row['gene_name'],
                                                          s=row['seq'])
        stats.setdefault(row['gene_name'], {'locations': []})
        stats[row['gene_name']]['locations'].append(row['location'])
    fname = {
        """3'UTR""": '3prime_utr.fa',
        """5'UTR""": '5prime_utr.fa',
        """CDS""": 'cds.fa',
        """ncRNA""": 'ncRNA.fa'
    }
    for label in seqs:
        with open('data/fasta/{f}'.format(f=fname[label]), 'w') as fh:
            fh.write(seqs[label])
    with open('data/fasta/all_locations_combined_fbfs.fa', 'w') as fh:
        fh.write("".join(seqs.values()))
    five_and_three = set()
    cds_and_three = set()
    five_cds_and_three = set()
    five_and_cds = set()
    only_three = set()
    only_cds = set()
    only_five = set()
    has_secondary = set()
    has_five = set()
    has_three = set()
    has_cds = set()
    for gene in stats:
        if ("""3'UTR"""
                in stats[gene]['locations']) and ("""5'UTR"""
                                                  in stats[gene]['locations']):
            five_and_three.add(gene)
        if ("""3'UTR"""
                in stats[gene]['locations']) and ("""CDS"""
                                                  in stats[gene]['locations']):
            cds_and_three.add(gene)
        if ("""3'UTR""" in stats[gene]['locations']) and (
                """5'UTR"""
                in stats[gene]['locations']) and ("""CDS"""
                                                  in stats[gene]['locations']):
            five_cds_and_three.add(gene)
        if ("""5'UTR"""
                in stats[gene]['locations']) and ("""CDS"""
                                                  in stats[gene]['locations']):
            five_and_cds.add(gene)
        if "5'UTR" in stats[gene]['locations']:
            has_five.add(gene)
        if "3'UTR" in stats[gene]['locations']:
            has_three.add(gene)
        if "CDS" in stats[gene]['locations']:
            has_cds.add(gene)

        if set(stats[gene]['locations']) == set(["3'UTR"]):
            only_three.add(gene)
        if set(stats[gene]['locations']) == set(["5'UTR"]):
            only_five.add(gene)
        if set(stats[gene]['locations']) == set(["CDS"]):
            only_cds.add(gene)
        if len(stats[gene]['locations']) > 1:
            has_secondary.add(gene)
    print """
    Number of genes: {total}
    Number of genes with a 3' peak: {a_three}
    Number of genes with a CDS peak: {a_cds}
    Number of genes with a 5' peak: {a_five}
    Number of genes with only 3' peaks: {o_3}
    Number of genes with only CDS peaks: {o_cds}
    Number of genes with only 5' peaks: {o_5}
    Number of genes with 3' and 5' peaks: {a}
    Number of genes with 3' and CDS peaks: {b}
    Number of genes with 5' and CDS peaks: {five_and_cds}
    Number of genes with 5' and 3' and CDS peaks: {c}
    Number of genes with a secondary peak: {has_second}
    """.format(total=len(list(set(stats.keys()))),
               a_three=len(has_three),
               a_cds=len(has_cds),
               a_five=len(has_five),
               o_3=len(list(only_three)),
               o_cds=len(list(only_cds)),
               o_5=len(list(only_five)),
               a=len(list(five_and_three)),
               b=len(list(cds_and_three)),
               five_and_cds=len(five_and_cds),
               c=len(list(five_cds_and_three)),
               has_second=len(list(has_secondary))))
    labels = ['ncRNA', """3'UTR""", """5'UTR""", 'CDS']
Example #17
def binaryStats(saveLocation='',
                ECC=True,
                PB=True,
                SEMI=True,
                hist=False,
                numBins=100,
                inf='',
                to_return=False):
    """
    Plots statistics of binary black hoels orbits from the various snapshots as color coded CDFs
    'saveLocation' is the folder in which to save the plots
    'ECC' True to plot eccentricity
    'PB' True to plot orbital period
    'SEMI' True to plot orbital semi-major axis
    'hist' True to plot an idividual histogram for each snapshot (WARNING: very slow)
    'numBins' to set the number of bins if 'hist' is True
    'inf' is the the suffix for the relevant shortfiles (usually the simulation name, e.g. 'N10K_r10_Z01_1')
    'to_return' set to true if instead of ploting the data, return it instead
    """
    if not saveLocation == '':
        if not os.path.exists(saveLocation):
            os.makedirs(saveLocation)
    columns = [0]
    indicies = [1, 1, 1]
    if ECC:
        columns = columns + [6]
        indicies[1] += 1
        indicies[2] += 1
    if PB:
        columns = columns + [7]
        indicies[2] += 1
    if SEMI:
        columns = columns + [8]
    bevData, meta = ip.bh_data('bev.82', columns, meta_data={}, info=inf)
    stats = {}
    for val in bevData:
        if not val[0] in stats:
            stats[val[0]] = {'ECC': [], 'PB': [], 'SEMI': []}
        if ECC:
            stats[val[0]]['ECC'].append(val[indicies[0]])
        if PB:
            stats[val[0]]['PB'].append(val[indicies[1]])
        if SEMI:
            stats[val[0]]['SEMI'].append(val[indicies[2]])
    if to_return:
        return (stats)
    all_keys = list(stats.keys())
    all_keys.sort()
    stats_keys = []
    for key in all_keys:
        non_zero = False
        for sub_key in stats[key]:
            if stats[key][sub_key] != []:
                non_zero = True
        if non_zero:
            stats_keys.append(key)
    plt.figure(1)
    colormap = plt.cm.coolwarm
    plt.gca().set_prop_cycle(
        cycler('color',
               [colormap(i) for i in np.linspace(0, 0.9, len(stats))]))
    plt.figure(2)
    plt.gca().set_prop_cycle(
        cycler('color',
               [colormap(i) for i in np.linspace(0, 0.9, len(stats))]))
    plt.figure(3)
    plt.hold(True)
    plt.gca().set_prop_cycle(
        cycler('color',
               [colormap(i) for i in np.linspace(0, 0.9, len(stats))]))
    for i, key in enumerate(stats_keys):
        temp_AU_list = []
        for item in stats[key]['SEMI']:
            AU = (10.0**item) * 0.00465047
            temp_AU_list.append(math.log10(AU))
        stats[key]['SEMI'] = temp_AU_list
        stats[key]['ECC'].sort()
        stats[key]['PB'].sort()
        stats[key]['SEMI'].sort()
        ECC_tot = _buildFraction(stats[key]['ECC'])
        PB_tot = _buildFraction(stats[key]['PB'])
        SEMI_tot = _buildFraction(stats[key]['SEMI'])
        if ECC:
            plt.figure(1)
            ECC_plot, = plt.step(([0] + stats[key]['ECC']), ([0] + ECC_tot),
                                 where='post')
        if PB:
            plt.figure(2)
            PB_plot, = plt.step(([0] + stats[key]['PB']), ([0] + PB_tot),
                                where='post')
        if SEMI:
            plt.figure(3)
            SEMI_plot, = plt.step(([0] + stats[key]['SEMI']), ([0] + SEMI_tot),
                                  where='post')
        if i == 0:
            if ECC:
                ECC_pi = ECC_plot
            if PB:
                PB_pi = PB_plot
            if SEMI:
                SEMI_pi = SEMI_plot
        if i == (len(stats_keys) - 1):
            if ECC:
                ECC_pf = ECC_plot
            if PB:
                PB_pf = PB_plot
            if SEMI:
                SEMI_pf = SEMI_plot
    min_key = stats_keys[0]
    max_key = stats_keys[-1]
    if ECC:
        plt.figure(5)
        Z = [[0, 0], [0, 0]]
        levels = range(int(min_key), int(max_key), 5)
        CS3 = plt.contourf(Z, levels, cmap=colormap)
        plt.clf()
        plt.figure(1)
        x1, x2, y1, y2 = plt.axis()
        plt.axis([x1, x2, 0, 1.1])
        plt.colorbar(CS3)
        plt.title('Binary Black Hole Eccentricity CDF')
        plt.xlabel('Eccentricity')
        plt.ylabel('Eccentricity Fraction')
        plt.savefig((saveLocation + 'ECC.png'))
    if PB:
        plt.figure(5)
        Z = [[0, 0], [0, 0]]
        levels = range(int(min_key), int(max_key), 5)
        CS3 = plt.contourf(Z, levels, cmap=colormap)
        plt.clf()
        plt.figure(2)
        x1, x2, y1, y2 = plt.axis()
        plt.axis([x1, x2, 0, 1.1])
        plt.colorbar(CS3)
        plt.title('Binary Black Hole Period CDF')
        plt.xlabel('Log_10( Period (days))')
        plt.ylabel('Period Fraction')
        plt.savefig((saveLocation + 'PB.png'))

        time = []
        max = []
        mean = []
        median = []
        mode = []
        min = []
        key_list = sorted(stats.keys())
        for key in key_list:
            npList = np.asarray(stats[key]['PB'])
            modeList = scipy.stats.mode(npList).mode
            for i in range(len(modeList)):
                time.append(key)
                max.append(np.amax(npList))
                mean.append(np.mean(npList))
                median.append(np.median(npList))
                mode.append(modeList[i])
                min.append(np.amin(npList))
        plt.figure(4)
        plt.plot(time, max, '-')
        plt.plot(time, mean, '-')
        plt.plot(time, median, '-')
        plt.plot(time, mode, '-')
        plt.plot(time, min, '-')
        plt.legend(['Max', 'Mean', 'Median', 'Mode', 'Min'])
        plt.title('Black Hole Binary Statistics Over Time')
        plt.xlabel('Physical Time (MY)')
        plt.ylabel('Log_10( Period (days))')
        plt.savefig((saveLocation + 'bBH_PBStats.png'))
    if SEMI:
        plt.figure(5)
        Z = [[0, 0], [0, 0]]
        levels = range(int(min_key), int(max_key), 5)
        CS3 = plt.contourf(Z, levels, cmap=colormap)
        plt.clf()
        plt.figure(3)
        x1, x2, y1, y2 = plt.axis()
        plt.axis([x1, x2, 0, 1.1])
        plt.colorbar(CS3)
        plt.title('Binary Black Hole Semi-major Axis CDF')
        plt.xlabel('Log_10( Semi-major Axis (Au))')
        plt.ylabel('Semi-major Axis Fraction')
        plt.savefig((saveLocation + 'SEMI.png'))
    plt.close('all')
    if hist:
        if ECC:
            for time in stats:
                plt.figure()
                n, bins, patches = plt.hist(stats[time]['ECC'],
                                            numBins,
                                            normed=False,
                                            histtype='bar',
                                            rwidth=1)
                plt.title(
                    'Eccentricity Distribution of Black Hole Binaries: {0} MY'.
                    format(time))
                plt.xlabel('Eccentricity')
                plt.ylabel('N')
                plt.savefig((saveLocation + 'ECC.{0}MY.png'.format(time)))
                plt.close('all')
        if PB:
            for time in stats:
                plt.figure()
                n, bins, patches = plt.hist(stats[time]['PB'],
                                            numBins,
                                            normed=False,
                                            histtype='bar',
                                            rwidth=1)
                plt.title('Period Distribution of Black Hole Binaries: {0} MY'.
                          format(time))
                plt.xlabel('Log_10( Period (days))')
                plt.ylabel('N')
                plt.savefig((saveLocation + 'PB.{0}MY.png'.format(time)))
                plt.close('all')
        if SEMI:
            for time in stats:
                plt.figure()
                n, bins, patches = plt.hist(stats[time]['SEMI'],
                                            numBins,
                                            normed=False,
                                            histtype='bar',
                                            rwidth=1)
                plt.title(
                    'Semi-major Axis Distribution of Black Hole Binaries: {0} MY'
                    .format(time))
                plt.xlabel('Log_10( Semi-major Axis (Au))')
                plt.ylabel('N')
                plt.savefig((saveLocation + 'SEMI.{0}MY.png'.format(time)))
                plt.close('all')
    plt.close('all')
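_buildFraction is not shown in this listing; the step plots above suggest it returns the cumulative fraction reached at each sorted value. A minimal stand-alone sketch of one such CDF curve:

# Stand-alone sketch of a single CDF step curve (toy eccentricity values).
import numpy as np
import matplotlib.pyplot as plt

values = np.sort(np.random.default_rng(0).uniform(0, 1, 200))
fraction = np.arange(1, len(values) + 1) / len(values)
plt.step([0] + list(values), [0] + list(fraction), where='post')
plt.ylim(0, 1.1)
plt.xlabel('Eccentricity')
plt.ylabel('Cumulative fraction')
plt.savefig('ECC_cdf_sketch.png')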
Example #18
def main():
    # process start time
    tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    # delete files in the processing folder
    clear_processing()
    #convert .mrc to MARC/XML
    Marc_XML = MARC_XML()
    Marc_XML.convert_marc_xml()
    BIBFRAME = BIB_builder()
    BIBFRAME.merger()
    folder = 'Processing'
    #iterate over BIBFRAME files
    for files in os.listdir(folder):
        tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        file = os.path.join(folder, files)
        filename = files.replace('.xml', '')
        print("processing " + filename)
        ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        log_file = str(filename) + "-error-logs"
        print(log_file)
        output = str(filename) + "-enhanced.xml"
        clearLogs(log_file, filename)
        # all the APIs that will be searched - to add a new API, add a new method to the SearchAPI class and register it by adding a staticmethod to APIFactory
        apis = [
            'search_api_LC', 'search_api_LCS', 'search_api_VF',
            'search_api_VFP', 'search_api_VFC'
        ]
        #this is needed for LC APIs
        query_type = "/authorities/names"
        # extracting names and titles from BIBFRAME
        bib_object = Bibframe(file, log_file)
        transformed = bib_object.convert_bibframe()
        names = bib_object.extract_names(transformed)[0]
        titles = bib_object.extract_names(transformed)[1]
        #getting corp names (for stat report)
        all_names = bib_object.extract_names(transformed)[2]
        corp_names = bib_object.extract_names(transformed)[3]
        print(str(all_names) + " names were extracted from " + filename)
        print(
            str(len(names)) + " unique names were extracted from " + filename +
            " --- " + str(len(names) - corp_names) + " Personal names and " +
            str(corp_names) + " Corporate names")
        print(str(len(titles)) + " titles were extracted from " + filename)
        #dictionaries for storing URIs (names and titles) and stats
        enriched_names = {}
        enriched_titles = {}
        stats = {}
        print("enriching names")
        # iterate over the name dictionary
        for index, item in enumerate(names.keys()):
            name = item.split('-_-_-')[0]
            print(index + 1, name)
            enriched_names[item] = []
            for api in apis:
                #check if the stat for the API already exists
                if api in stats.keys():
                    pass
                else:
                    stats[api] = 0
                # getting the API method
                name_result = APIFactory().get_API(name, query_type, api,
                                                   log_file)
                # if the results are not empty, append the result to the "enriched_names" dictionary using the API name as key
                if name_result:
                    enriched_names[item].append(name_result)
                    # add the number of results, to be used later in the stats report
                    stats[api] = stats[api] + len(name_result)
        print("enriching titles")
        # iterate over the title dictionary
        for index, title in enumerate(titles.keys()):
            print(index + 1, title)
            for authors in titles[title]['authors']:
                author = authors.split('-_-_-')[0]
                key = str(author) + "-_-_-" + str(title)
                enriched_titles[key] = []
                title_result = APIFactory().get_API(author, title,
                                                    'search_OCLC', log_file)
                if title_result:
                    enriched_titles[key].append(title_result)
        # getting rid of unwanted things
        name_results = clean_up(enriched_names)
        title_result = clean_up(enriched_titles)
        # get the best URI for each API (highest score) and store it in final_names and final_titles
        result_names_Object = Results(name_results, names, file, 'name',
                                      log_file)
        result_names_Object.maximizer()
        final_names = result_names_Object.mapping()
        result_title_Object = Results(title_result, titles, file, 'title',
                                      log_file)
        final_titles = result_title_Object.mapping()
        eff = get_stat(final_names, len(names), final_titles, len(titles),
                       filename)
        stats['names-enriched'] = len(final_names)
        tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        #write back the URIs to the BIBFRAME file
        write(final_names, final_titles, file, output, log_file, filename)
        tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(
            tff, '%H:%M:%S')
        file_process_time = datetime.strptime(
            tfw, '%H:%M:%S') - datetime.strptime(tfs, '%H:%M:%S')
        write_stats(eff, stats, filename, len(titles), len(names), all_names,
                    corp_names, file_process_time, write_time)
        #removing temp-file.xml
        delete_temp()
        print(filename + " processed in: ", file_process_time,
              " --- writing process :", write_time)
    tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime(
        tps, '%H:%M:%S')
    print("walltime:", process_time)
Example #19
    pass
stats.dropna(axis='columns', how='all', inplace=True)
#'no_pop_frac', 'no_thresh_frac', 'pop_abs_mis_95width',
#       'pop_abs_mis_median', 'rms_test', 'thresh_rel_mis_95width',
#       'thresh_rel_mis_median', 'l2_norm_weighted'
#print(stats.max())
#print(stats.min())
#print(stats.mean())
#print(stats.abs().mean())
del stats['pop_abs_mis_95width']
del stats['thresh_rel_mis_95width']
stats['rms'] = stats.pop('rms')
stats['thresh'] = stats.pop('thresh_rel_mis_median').abs().apply(np.max)
stats['no_thresh_frac'] = stats.pop('no_thresh_frac').apply(np.max)
stats['pop'] = (14 - stats.pop('pop_abs_mis_median').abs()).apply(np.max)
if 'wobble_tot' in stats.keys():
    stats['wobble_tot'] = stats.pop('wobble_tot').apply(np.max)
    stats['wobble_unstab'] = stats.pop('wobble_unstab').apply(np.max)
stats['pop_frac'] = (1 - stats.pop('no_pop_frac')).apply(np.max)
#stats.dropna(inplace=True)
try:
    del stats['dual_thresh_mismatch_95width']
    stats['thresh_mismatch'] = stats.pop(
        'dual_thresh_mismatch_median').abs().apply(np.max)
except KeyError:
    pass
#(stats/stats.max()).nsmallest(10, 'rms').plot.bar()

fig = plt.figure()
gs = gridspec.GridSpec(2,
                       1,
Example #20
def calculate_accuracy():
    stats = {}
    stats['all'] = {'tp': 0, 'fp': 0, 'fn': 0, 'btp': 0, 'bfp': 0, 'bfn': 0}
    for c in classes:
        stats[c] = {'tp': 0, 'fp': 0, 'fn': 0, 'btp': 0, 'bfp': 0, 'bfn': 0}

    gt_boxes = []
    predicted_boxes = []
    gt_box_label = []
    predicted_box_label = []
    for obj_id in set(gt_obj_id):
        mask = gt_obj_id == obj_id
        inliers = point_orig_list[mask, :3]
        prediction = gt_cls_id[mask][0]
        gt_boxes.append(mask)
        gt_box_label.append(prediction)
    for obj_id in set(predicted_obj_id):
        mask = predicted_obj_id == obj_id
        if numpy.sum(mask) > 50:
            inliers = point_orig_list[mask, :3]
            prediction = scipy.stats.mode(predicted_cls_id[mask])[0][0]
            predicted_boxes.append(mask)
            predicted_box_label.append(prediction)
    predicted_boxes = numpy.array(predicted_boxes)
    gt_boxes = numpy.array(gt_boxes)
    matched = numpy.zeros(len(predicted_boxes), dtype=bool)
    print('%d/%d boxes' % (len(predicted_boxes), len(gt_boxes)))
    for i in range(len(gt_boxes)):
        same_cls = gt_box_label[i] == predicted_box_label
        if numpy.sum(same_cls) == 0:
            stats[classes[gt_box_label[i]]]['bfn'] += 1
            stats['all']['bfn'] += 1
            continue
        intersection = numpy.sum(numpy.logical_and(gt_boxes[i],
                                                   predicted_boxes[same_cls]),
                                 axis=1)
        IOU = intersection / (1.0 * numpy.sum(gt_boxes[i]) + numpy.sum(
            predicted_boxes[same_cls], axis=1) - intersection)
        if IOU.max() > 0.5:
            matched[numpy.nonzero(same_cls)[0][numpy.argmax(IOU)]] = True
            stats[classes[gt_box_label[i]]]['btp'] += 1
            stats['all']['btp'] += 1
        else:
            stats[classes[gt_box_label[i]]]['bfn'] += 1
            stats['all']['bfn'] += 1
    for i in range(len(predicted_boxes)):
        if not matched[i]:
            stats[classes[predicted_box_label[i]]]['bfp'] += 1
            stats['all']['bfp'] += 1

    for g in range(len(predicted_cls_id)):
        if gt_cls_id[g] == predicted_cls_id[g]:
            stats[classes[int(gt_cls_id[g])]]['tp'] += 1
            stats['all']['tp'] += 1
        else:
            stats[classes[int(gt_cls_id[g])]]['fn'] += 1
            stats['all']['fn'] += 1
            stats[classes[predicted_cls_id[g]]]['fp'] += 1
            stats['all']['fp'] += 1

    prec_agg = []
    recl_agg = []
    bprec_agg = []
    brecl_agg = []
    iou_agg = []
    print("%10s %6s %6s %6s %5s %5s %5s %3s %3s %3s %5s %5s" %
          ('CLASS', 'TP', 'FP', 'FN', 'PREC', 'RECL', 'IOU', 'BTP', 'BFP',
           'BFN', 'PREC', 'RECL'))
    for c in sorted(stats.keys()):
        try:
            stats[c]['pr'] = 1.0 * stats[c]['tp'] / (stats[c]['tp'] +
                                                     stats[c]['fp'])
        except ZeroDivisionError:
            stats[c]['pr'] = 0
        try:
            stats[c]['rc'] = 1.0 * stats[c]['tp'] / (stats[c]['tp'] +
                                                     stats[c]['fn'])
        except ZeroDivisionError:
            stats[c]['rc'] = 0
        try:
            stats[c]['IOU'] = 1.0 * stats[c]['tp'] / (
                stats[c]['tp'] + stats[c]['fp'] + stats[c]['fn'])
        except ZeroDivisionError:
            stats[c]['IOU'] = 0
        try:
            stats[c]['bpr'] = 1.0 * stats[c]['btp'] / (stats[c]['btp'] +
                                                       stats[c]['bfp'])
        except ZeroDivisionError:
            stats[c]['bpr'] = 0
        try:
            stats[c]['brc'] = 1.0 * stats[c]['btp'] / (stats[c]['btp'] +
                                                       stats[c]['bfn'])
        except ZeroDivisionError:
            stats[c]['brc'] = 0
        if c not in ['all']:
            print(
                "%10s %6d %6d %6d %5.3f %5.3f %5.3f %3d %3d %3d %5.3f %5.3f" %
                (c, stats[c]['tp'], stats[c]['fp'], stats[c]['fn'],
                 stats[c]['pr'], stats[c]['rc'], stats[c]['IOU'],
                 stats[c]['btp'], stats[c]['bfp'], stats[c]['bfn'],
                 stats[c]['bpr'], stats[c]['brc']))
            prec_agg.append(stats[c]['pr'])
            recl_agg.append(stats[c]['rc'])
            iou_agg.append(stats[c]['IOU'])
            bprec_agg.append(stats[c]['bpr'])
            brecl_agg.append(stats[c]['brc'])
    c = 'all'
    print("%10s %6d %6d %6d %5.3f %5.3f %5.3f %3d %3d %3d %5.3f %5.3f" %
          ('all', stats[c]['tp'], stats[c]['fp'], stats[c]['fn'],
           stats[c]['pr'], stats[c]['rc'], stats[c]['IOU'], stats[c]['btp'],
           stats[c]['bfp'], stats[c]['bfn'], stats[c]['bpr'], stats[c]['brc']))
    print("%10s %6d %6d %6d %5.3f %5.3f %5.3f %3d %3d %3d %5.3f %5.3f" %
          ('avg', stats[c]['tp'], stats[c]['fp'], stats[c]['fn'],
           numpy.mean(prec_agg), numpy.mean(recl_agg), numpy.mean(iou_agg),
           stats[c]['btp'], stats[c]['bfp'], stats[c]['bfn'],
           numpy.mean(bprec_agg), numpy.mean(brecl_agg)))
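The box matching above hinges on the intersection-over-union of boolean point masks, with a match declared when the best IoU exceeds 0.5. A self-contained sketch of that criterion:

# Self-contained sketch of the IoU match criterion on two toy boolean masks.
import numpy as np

gt_mask = np.zeros(100, dtype=bool)
gt_mask[:60] = True
pred_mask = np.zeros(100, dtype=bool)
pred_mask[30:80] = True

intersection = np.logical_and(gt_mask, pred_mask).sum()
union = gt_mask.sum() + pred_mask.sum() - intersection
iou = intersection / union
print(f"IoU = {iou:.2f}, matched = {iou > 0.5}")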
Example #21
def plot_cdf(stats, out_pathname, legend_x):

    # color=iter(matplotlib.cm.Set1(np.linspace(0,0.5,len(stats.keys()))))
    colors = iter(['red', 'black', 'red', 'black'])
    print("len(stats)=%d" % (len(stats)))
    x_max = 0
    matplotlib.rcParams['xtick.labelsize'] = 20
    matplotlib.rcParams['ytick.labelsize'] = 20

    if len(stats) == 2:
        folge = ['Vanilla', 'Gekuerzt']  #['Einfach', 'Botnetz']
        linecycler = cycle(["-", "--"])
    elif len(stats) == 4:
        folge = [
            'Vanilla_Einfach', 'Vanilla_Botnetz', 'Gekuerzt_Einfach',
            'Gekuerzt_Botnetz'
        ]
        linecycler = cycle(["-", "-", "--", "--"])
    else:
        print("error. len(stats)=%d" % (len(stats)))
        sys.exit(1)

    for line in folge:
        # print(line)
        try:
            data = stats[line]
        except KeyError:
            # fall back to the first stats key that contains `line` as a substring
            line_n = [s for s in stats.keys() if line in s][0]
            print("%s was found in stats as %s." % (line, line_n))
            data = stats[line_n]

        # sorted_data=np.sort(data)
        # yvals=np.arange(1.0,float(len(sorted_data))+1.0)/float(len(sorted_data))

        # print(data)
        #yvals=np.arange(len(sorted_data))/float(len(sorted_data))
        # # new x values
        # xn_ax = np.linspace(sorted_data.min(), sorted_data.max(), 200)
        # print(len(xn_ax))
        # # new y values
        # yn_ax = spline(sorted_data, yvals, xn_ax)
        # print(len(yn_ax))

        data_max = max(data)
        data_shown = [x for x in data if x < data_max]
        shown_percentile = float(len(data_shown)) / len(data)
        print("shown_percentile:", shown_percentile)
        x, y = getcdf(data, shown_percentile)

        # print(find_nearest(y, 0.50))
        if shown_percentile > 0:
            # report the x value closest to several eCDF levels
            for level in (0.80, 0.70, 0.50, 0.30, 0.20):
                q, i = find_nearest(y, level)
                print("%s: data[%d]=%f equals %f." % (line, i, y[i], x[i]))
        # print(x)
        if len(x) != 0 and x_max < max(x):
            x_max = max(x)
        matplotlib.pyplot.plot(x,
                               y,
                               color=next(colors),
                               label=line,
                               linewidth=2,
                               linestyle=next(linecycler))

    # matplotlib.pyplot.style.use('ggplot')
    # matplotlib.pyplot.figure(figsize=(8,6), dpi=72, facecolor="white")
    matplotlib.pyplot.legend(loc='best', fontsize=20)  #, fontsize = 'small')

    # if legend_x == 'Days from first stream':
    #     matplotlib.pyplot.xlim(xmin=0.0, xmax=x_max)
    # else:
    #     matplotlib.pyplot.xlim(xmin=0.0)
    # matplotlib.pyplot.xscale('symlog')
    matplotlib.pyplot.xlim(xmin=0.0, xmax=x_max)
    matplotlib.pyplot.ylim(ymin=0.0)

    # matplotlib.pyplot.yscale('log')
    # matplotlib.pyplot.yscale('close_to_one')
    matplotlib.pyplot.yticks(np.arange(0, 1.2, 0.2))

    # matplotlib.pyplot.xticks(np.arange(0, x_max, 0.1))

    matplotlib.pyplot.xlabel(legend_x, fontsize=20)
    matplotlib.pyplot.ylabel('eCDF', fontsize=20)
    #    matplotlib.pyplot.title(title, fontsize=fontsize)
    # matplotlib.pyplot.grid()
    matplotlib.pyplot.tight_layout()
    matplotlib.pyplot.savefig(out_pathname)
    matplotlib.pyplot.close()

    #########################################################################
    # Export Data
    #########################################################################
    a = []
    h = []
    for line in folge:
        h.append(line)
        data = stats[line]
        x, y = getcdf(data, 1)
        a.append(x)
        a.append(y)

    h = ", ".join(h)
    np.savetxt(out_pathname + ".dat", np.transpose(a), header=h, fmt="%10s")
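
# `getcdf` and `find_nearest` are not defined in this excerpt. A minimal
# sketch of helpers with the shape the code above assumes (an assumption
# about their behaviour, not the original implementations):
import numpy as np

def getcdf(data, shown_percentile=1.0):
    """Empirical CDF of `data`, truncated at `shown_percentile` of the mass."""
    x = np.sort(np.asarray(data, dtype=float))
    y = np.arange(1, len(x) + 1) / float(len(x))
    keep = y <= shown_percentile
    return x[keep], y[keep]

def find_nearest(arr, value):
    """Return (nearest value, its index) in `arr` for the target `value`."""
    arr = np.asarray(arr)
    i = int(np.abs(arr - value).argmin())
    return arr[i], i
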
Exemple #22
0
def cv_metrics_stratified_class_with_indexes_and_transform(
        X,
        Y,
        indexes,
        iclf,
        iclfk={},
        transform=None,
        kfold=5,
        shuffle=True,
        report_metrics=[
            'matthews_corr_coef', 'roc_auc_score', 'f1_score', 'sensitivity',
            'specificity'
        ],
        norm=False,
        calc_stats=True,
        report_name='CLF',
        sort_metric='roc_auc_score_min',
        transformations=[],
        features_top_ns=[],
        X_names=[],
        vectors=[],
        vector=None,
        allow_x_list_size=1):
    output_objs = {}
    output_metrics = {}
    stats_df = []
    report_name_sufix = ''
    report_name_sufix_xs = ''
    conditions = [
        type(X) == list,
        len(transformations) > allow_x_list_size,
        len(features_top_ns) == len(transformations),
        len(X_names) == len(transformations)
    ]
    multiple_x = utils.validate_multiple_conditions(conditions)
    for train_index, test_index in indexes:
        if multiple_x:
            X_train, X_test, y_train, y_test = transform_and_join(
                X,
                Y,
                train_index,
                test_index,
                transformations,
                features_top_ns,
                iclf,
                iclfk,
                joint_transformation=None,
                vectors=vectors)
            report_name_sufix_xs = [
                xn + "_" + tr + "_" + str(feats) for xn, tr, feats in zip(
                    X_names, transformations, features_top_ns)
            ]
            report_name_sufix_xs = " & ".join(report_name_sufix_xs)
        else:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
        total_features = X_train.shape[-1]
        number_of_features = total_features
        if type(transform) == str:
            report_name_sufix = report_name_sufix_xs + "_" + transform
            number_of_features = 1
            X_train, X_test, y_train, y_test = transform_x_train_test(
                X_train,
                X_test,
                y_train,
                y_test,
                transform=transform,
                iclf=iclf,
                iclfk=iclfk,
                vector=vector)
        if len(y_test.shape) > 1:
            y_test = y_test.argmax(1)
        if norm == True:
            X_train, X_test = norm_z_score(X_train, X_test)
        start_feature_number = number_of_features
        end_feature_number = total_features + 1
        for feature_number in range(start_feature_number, end_feature_number):
            tmp_scores = output_metrics.get('F' + str(feature_number), {})
            X_train_ = X_train[:, :feature_number]
            X_test_ = X_test[:, :feature_number]
            tmp_scores = fit_and_get_metrics(X_train_, X_test_, y_train,
                                             y_test, iclf, iclfk,
                                             report_metrics, tmp_scores)
            output_metrics['F' + str(feature_number)] = tmp_scores
    if calc_stats == True:
        for fn in range(number_of_features, total_features + 1):
            fk = 'F' + str(fn)
            metrics = output_metrics[fk]
            metrics_report = {
                'Name': report_name + report_name_sufix,
                'Number of Variables': fn
            }
            for m in metrics.keys():
                stats = metrics_stats(metrics[m], rn=3)
                for sk in stats.keys():
                    metrics_report[m + "_" + sk] = stats[sk]
            stats_df.append(metrics_report)
        stats_df = pd.DataFrame(stats_df).sort_values(
            by=[sort_metric, 'Number of Variables'],
            ascending=[False, True]).reset_index(drop=True)
    return output_metrics.copy(), stats_df.copy()
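
# A usage sketch for the function above, under assumptions: the toy data and
# estimator below are hypothetical, the module-level helpers it relies on
# (fit_and_get_metrics, metrics_stats, utils.validate_multiple_conditions) are
# importable, and `iclf` is an estimator class instantiated with `iclfk`.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(100, 6)
Y = np.random.randint(0, 2, size=100)
indexes = list(StratifiedKFold(n_splits=5, shuffle=True,
                               random_state=0).split(X, Y))

metrics, report = cv_metrics_stratified_class_with_indexes_and_transform(
    X, Y, indexes, iclf=LogisticRegression, iclfk={'max_iter': 1000},
    report_name='LogReg')
print(report.head())
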
Exemple #23
0
def main():
    # process start time
    tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')    
    # delete files in the processing folder
    clear_processing()
    #convert .mrc to MARC/XML
    Marc_XML = MARC_XML()
    Marc_XML.convert_marc_xml()
    BIBFRAME = BIB_builder()
    BIBFRAME.merger()
    folder = 'Processing'
    #iterate over BIBFRAME files
    for files in os.listdir(folder):
        tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        file = os.path.join(folder, files)
        filename = files.replace('.xml', '')
        print ("processing " + filename)
        ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        log_file = str(filename) + "-error-logs"
        print (log_file)
        output = str(filename) + "-enhanced.xml" 
        clearLogs(log_file, filename)
        # all the APIs that will be searched - to add a new API, add a new method to the SearchAPI class and register it via a staticmethod in APIFactory
        apis = ['search_api_LC', 'search_api_LCS', 'search_api_VF', 'search_api_VFP', 'search_api_VFC']
        #this is needed for LC APIs
        query_type = "/authorities/names"
        # extracting names and titles from BIBFRAME
        bib_object = Bibframe(file, log_file)
        transformed = bib_object.convert_bibframe()
        names = bib_object.extract_names(transformed)[0]
        titles = bib_object.extract_names(transformed)[1]
        #getting corp names (for stat report)
        all_names = bib_object.extract_names(transformed)[2]
        corp_names = bib_object.extract_names(transformed)[3]
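        # Note: extract_names(transformed) is called four times with the same
        # argument above; calling it once and unpacking the returned sequence
        # would avoid repeating the extraction work.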
        print(str(all_names) + " names were extracted from " + filename)
        print (str(len(names)) + " unique names were extracted from " + filename + " --- " + str(len(names) - corp_names) + " Personal names and " + str(corp_names) + " Corporate names")
        print (str(len(titles)) + " titles were extracted from " + filename)
        #dictionaries for storing URIs (names and titles) and stats
        enriched_names = {}
        enriched_titles = {}
        stats = {}
        print ("enriching names")
        # iterate over the name dictionary 
        for index, item in enumerate(names.keys()):
            name = item.split('-_-_-')[0]
            print(index+1, name)
            enriched_names[item] = []
            for api in apis:
                # initialise the per-API counter if it does not exist yet
                stats.setdefault(api, 0)
                # getting the API method
                name_result = APIFactory().get_API(name, query_type, api, log_file)
                # if the results are not empty, append them to this item's entry in "enriched_names"
                if name_result:
                    enriched_names[item].append(name_result)
                    # add the number of results, to be used later in the stats report
                    stats[api] = stats[api] + len(name_result)
        print ("enriching titles")
        # iterate over the title dictionary
        for index, title in enumerate(titles.keys()):
            print(index+1, title)
            for authors in titles[title]['authors']:
                author =  authors.split('-_-_-')[0]
                key = str(author) + "-_-_-" + str(title)
                enriched_titles[key] = []
                title_result = APIFactory().get_API(author, title, 'search_OCLC', log_file)
                if title_result:
                    enriched_titles[key].append(title_result)
        # getting rid of unwanted things
        name_results = clean_up(enriched_names)
        title_result = clean_up(enriched_titles)
        # get the best URI from each API (highest score) and store it in final_names and final_titles
        result_names_Object = Results(name_results, names, file, 'name', log_file)
        result_names_Object.maximizer()
        final_names = result_names_Object.mapping()
        result_title_Object = Results(title_result, titles, file, 'title', log_file)
        final_titles = result_title_Object.mapping()
        eff = get_stat(final_names, len(names), final_titles, len(titles), filename)
        stats['names-enriched'] = len(final_names)
        tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        #write back the URIs to the BIBFRAME file
        write(final_names, final_titles, file, output, log_file, filename)
        tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
        write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tff, '%H:%M:%S')
        file_process_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tfs, '%H:%M:%S')
        write_stats(eff, stats, filename, len(titles), len(names), all_names, corp_names, file_process_time, write_time)
        #removing temp-file.xml
        delete_temp()
        print(filename + " processed in: ", file_process_time, " --- writing process :", write_time)
    tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime(tps, '%H:%M:%S')
    print("walltime:", process_time)
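
# The wall-time bookkeeping above formats timestamps as 'HH:MM:SS' strings and
# parses them back, which also misreports runs that cross midnight. A simpler
# equivalent (a sketch, not the original code) subtracts datetime objects
# directly and keeps the resulting timedelta:
from datetime import datetime

start = datetime.now()
...  # do the work here
print("walltime:", datetime.now() - start)   # a timedelta; survives midnight
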
Exemple #24
0
    return {'mean': mean, 'stddev': stddev}


stats = {}

correspondences = correspondences[::-1]

n_groups = len(correspondences)

table = PrettyTable()
table.field_names = [
    "Sensor", "Old mean", "New mean", "% change", "Old stddev", "New stddev"
]

for c in correspondences:
    if c not in stats.keys():
        stats[c] = {}

    stats[c]['new'] = getStats(*new[c].T)
    stats[c]['old'] = getStats(*old[c].T)
    stats[c]['combined'] = getStats(*np.vstack([new[c], old[c]]).T)

correspondences = sorted(
    correspondences,
    key=lambda c: 100 * (stats[c]['new']['mean'] - stats[c]['old']['mean'])
    / stats[c]['old']['mean'])

for c in correspondences:
    table.add_row([
        c,
        round(stats[c]['old']['mean'], 3),
        round(stats[c]['new']['mean'], 3),
Exemple #25
0
def marc_process(processing_files, apis):
    # process start time
    tps = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')   
    db_update_obj = P_progress(pid=processing_files)
    db_update_obj.save()
    file = "Webapp/source/%s" %(processing_files.name) 
    #convert .mrc to MARC/XML
    Marc_XML = MARC_XML(file)
    files = Marc_XML.convert_marc_xml(db_update_obj)
    tfs = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    file = os.path.join('', files)
    filename = files.replace('.xml', '').replace('Webapp/Files/Processing/', '')
    print ("processing " + filename)
    ts = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    log_file = str(filename) + "-error-logs"
    output = str(filename) + "-enhanced.xml" 
    clearLogs(log_file, filename)
    query_type = "/authorities/names"
    # extracting names and titles from BIBFRAME
    db_update_obj.stage = "Extracting_names_and_titles"
    db_update_obj.save()
    mergecheck = True
    bib_object = Bibframe(file, log_file, mergecheck)
    transformed = bib_object.convert_bibframe()
    names = bib_object.extract_names(transformed)[0]
    titles = bib_object.extract_names(transformed)[1]
    #getting corp names (for stat report)
    all_names = bib_object.extract_names(transformed)[2]
    corp_names = bib_object.extract_names(transformed)[3]
    print(str(all_names) + " names were extracted from " + filename)
    print(str(len(names)) + " unique names were extracted from " + filename + " --- " + str(len(names) - corp_names) + " Personal names and " + str(corp_names) + " Corporate names")
    print(str(len(titles)) + " titles were extracted from " + filename)
    db_update_obj.all_names = str(len(names))
    db_update_obj.all_titles = str(len(titles))
    db_update_obj.p_names = str(len(names) - corp_names)
    db_update_obj.c_names=str(corp_names)
    db_update_obj.save()
    #dictionaries for storing URIs (names and titles) and stats
    enriched_names = {}
    enriched_titles = {}
    stats = {}
    print ("enriching names")
    # iterate over the name dictionary 
    db_update_obj.stage = "Enriching_names"
    db_update_obj.save()
    for index, item in enumerate(names.keys()):
        db_update_obj.name_index = index+1
        db_update_obj.save()
        name = item.split('-_-_-')[0]
        #print(index+1, name)
        enriched_names[item] = []
        for api in apis:
            # initialise the per-API counter if it does not exist yet
            stats.setdefault(api, 0)
            # getting the API method
            name_result = APIFactory().get_API(name, query_type, api, log_file)
            # if the results are not empty, append them to this item's entry in "enriched_names"
            if name_result:
                enriched_names[item].append(name_result)
                # add the number of results, to be used later in the stats report
                stats[api] = stats[api] + len(name_result)
    print ("enriching titles")
    # iterate over the title dictionary
    db_update_obj.stage = "Enriching_titles"
    db_update_obj.save()
    for index, title in enumerate(titles.keys()):
        db_update_obj.title_index = index+1
        db_update_obj.save()
        #print(index+1, title)
        for authors in titles[title]['authors']:
            author =  authors.split('-_-_-')[0]
            key = str(author) + "-_-_-" + str(title)
            enriched_titles[key] = []
            title_result = APIFactory().get_API(author, title, 'search_OCLC', log_file)
            if title_result:
                enriched_titles[key].append(title_result)
    # getting rid of unwanted things
    db_update_obj.stage = "Optimization"
    db_update_obj.save()
    name_results = clean_up(enriched_names)
    title_result = clean_up(enriched_titles)
    # get the best URI from each API (highest score) and store it in final_names and final_titles
    result_names_Object = Results(name_results, names, file, 'name', log_file)
    result_names_Object.maximizer()
    final_names = result_names_Object.mapping()
    result_title_Object = Results(title_result, titles, file, 'title', log_file)
    final_titles = result_title_Object.mapping()
    tff = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    #write back the URIs to the BIBFRAME file
    db_update_obj.stage = "Writing_to_BIBFRAME"
    db_update_obj.save()
    write(final_names, final_titles, file, output, log_file, filename)
    eff = get_stat(final_names, len(names), final_titles, len(titles), filename)
    stats['names-enriched'] = len(final_names)
    tfw = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    write_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tff, '%H:%M:%S')
    file_process_time = datetime.strptime(tfw, '%H:%M:%S') - datetime.strptime(tfs, '%H:%M:%S')
    write_stats(eff, stats, filename, len(titles), len(names), all_names, corp_names, file_process_time, write_time, db_update_obj)
    #removing temp-file.xml
    delete_temp()
    print(filename + " processed in: ", file_process_time, " --- writing process :", write_time)
    tpf = datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    process_time = datetime.strptime(tpf, '%H:%M:%S') - datetime.strptime(tps, '%H:%M:%S')
    db_update_obj.stage = "The process was completed in %s" %(process_time)
    db_update_obj.save()
    print("walltime:", process_time)
    add_to_archive(processing_files, db_update_obj, len(final_names), len(final_titles))
Exemple #26
0
def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05):
    """Test equality of variance.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, list or dict
        Iterable. Can be either a list / dictionary of iterables
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Statistical test. `'levene'` (default) performs the Levene test
        using :py:func:`scipy.stats.levene`, and `'bartlett'` performs the
        Bartlett test using :py:func:`scipy.stats.bartlett`.
        The former is more robust to departure from normality.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W/T'``: Test statistic ('W' for Levene, 'T' for Bartlett)
        * ``'pval'``: p-value
        * ``'equal_var'``: True if ``data`` has equal variance

    See Also
    --------
    normality : Univariate normality test.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The **Bartlett** :math:`T` statistic [1]_ is defined as:

    .. math::

        T = \\frac{(N-k) \\ln{s^{2}_{p}} - \\sum_{i=1}^{k}(N_{i} - 1)
        \\ln{s^{2}_{i}}}{1 + (1/(3(k-1)))((\\sum_{i=1}^{k}{1/(N_{i} - 1))}
        - 1/(N-k))}

    where :math:`s_i^2` is the variance of the :math:`i^{th}` group,
    :math:`N` is the total sample size, :math:`N_i` is the sample size of the
    :math:`i^{th}` group, :math:`k` is the number of groups,
    and :math:`s_p^2` is the pooled variance.

    The pooled variance is a weighted average of the group variances and is
    defined as:

    .. math:: s^{2}_{p} = \\sum_{i=1}^{k}(N_{i} - 1)s^{2}_{i}/(N-k)

    The p-value is then computed using a chi-square distribution:

    .. math:: T \\sim \\chi^2(k-1)

    The **Levene** :math:`W` statistic [2]_ is defined as:

    .. math::

        W = \\frac{(N-k)} {(k-1)}
        \\frac{\\sum_{i=1}^{k}N_{i}(\\overline{Z}_{i.}-\\overline{Z})^{2} }
        {\\sum_{i=1}^{k}\\sum_{j=1}^{N_i}(Z_{ij}-\\overline{Z}_{i.})^{2} }

    where :math:`Z_{ij} = |Y_{ij} - \\text{median}({Y}_{i.})|`,
    :math:`\\overline{Z}_{i.}` are the group means of :math:`Z_{ij}` and
    :math:`\\overline{Z}` is the grand mean of :math:`Z_{ij}`.

    The p-value is then computed using an F-distribution:

    .. math:: W \\sim F(k-1, N-k)

    .. warning:: Missing values are not supported for this function.
        Make sure to remove them beforehand using the
        :py:meth:`pandas.DataFrame.dropna` or :py:func:`pingouin.remove_na`
        functions.

    References
    ----------
    .. [1] Bartlett, M. S. (1937). Properties of sufficiency and statistical
           tests. Proc. R. Soc. Lond. A, 160(901), 268-282.

    .. [2] Brown, M. B., & Forsythe, A. B. (1974). Robust tests for the
           equality of variances. Journal of the American Statistical
           Association, 69(346), 364-367.

    Examples
    --------
    1. Levene test on a wide-format dataframe

    >>> import numpy as np
    >>> import pingouin as pg
    >>> data = pg.read_dataset('mediation')
    >>> pg.homoscedasticity(data[['X', 'Y', 'M']])
                   W      pval  equal_var
    levene  0.434861  0.999997       True

    2. Bartlett test using a list of iterables

    >>> data = [[4, 8, 9, 20, 14], np.array([5, 8, 15, 45, 12])]
    >>> pg.homoscedasticity(data, method="bartlett", alpha=.05)
                     T      pval  equal_var
    bartlett  2.873569  0.090045       True

    3. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.homoscedasticity(data, dv='Performance', group='Time')
                   W      pval  equal_var
    levene  3.192197  0.079217       True
    """
    assert isinstance(data, (pd.DataFrame, list, dict))
    assert method.lower() in ['levene', 'bartlett']
    func = getattr(scipy.stats, method)
    if isinstance(data, pd.DataFrame):
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            assert numdata.shape[1] > 1, 'Data must have at least two columns.'
            statistic, p = func(*numdata.to_numpy())
        else:
            # Long-format
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group)[dv]
            assert grp.ngroups > 1, 'Data must have at least two groups.'
            statistic, p = func(*grp.apply(list))
    elif isinstance(data, list):
        # Check that list contains other list or np.ndarray
        assert all(isinstance(el, (list, np.ndarray)) for el in data)
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data)
    else:
        # Data is a dict
        assert all(isinstance(el, (list, np.ndarray)) for el in data.values())
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data.values())

    equal_var = True if p > alpha else False
    stat_name = 'W' if method.lower() == 'levene' else 'T'

    stats = {stat_name: statistic, 'pval': p, 'equal_var': equal_var}

    return pd.DataFrame(stats, columns=stats.keys(), index=[method])
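
# A minimal check (a sketch, not part of the function above) that the Bartlett
# formula quoted in the docstring reproduces scipy.stats.bartlett on the same
# toy data used in example 2:
import numpy as np
from scipy import stats as sps

groups = [np.array([4, 8, 9, 20, 14], dtype=float),
          np.array([5, 8, 15, 45, 12], dtype=float)]
k = len(groups)
n_i = np.array([len(g) for g in groups])
N = n_i.sum()
s2_i = np.array([g.var(ddof=1) for g in groups])      # group variances
s2_p = np.sum((n_i - 1) * s2_i) / (N - k)              # pooled variance

num = (N - k) * np.log(s2_p) - np.sum((n_i - 1) * np.log(s2_i))
den = 1 + (1.0 / (3 * (k - 1))) * (np.sum(1.0 / (n_i - 1)) - 1.0 / (N - k))
T = num / den
pval = sps.chi2.sf(T, k - 1)

T_ref, p_ref = sps.bartlett(*groups)
assert np.isclose(T, T_ref) and np.isclose(pval, p_ref)
print(round(T, 6), round(pval, 6))   # ~2.873569 ~0.090045, as in the docstring
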
Exemple #27
0
def write_fasta_by_location(peaks):
    seqs = {}
    stats = {}
    for index, row in peaks.iterrows():
        seqs.setdefault(row['location'], '')
        seqs[row['location']] += '\n>{i}_{g}\n{s}'.format(
            i=index, g=row['gene_name'], s=row['seq']
        )
        stats.setdefault(row['gene_name'], {'locations': []})
        stats[row['gene_name']]['locations'].append(row['location'])
    fname = {
        """3'UTR""": '3prime_utr.fa',
        """5'UTR""": '5prime_utr.fa',
        """CDS""": 'cds.fa',
        """ncRNA""": 'ncRNA.fa'
    }
    for label in seqs:
        with open('data/fasta/{f}'.format(f=fname[label]), 'w') as fh:
            fh.write(seqs[label])
    five_and_three = set()
    cds_and_three = set()
    five_cds_and_three = set()
    only_three = set()
    only_cds = set()
    only_five = set()
    has_secondary = set()
    for gene in stats:
        if ("""3'UTR""" in stats[gene]['locations']) and (
            """5'UTR""" in stats[gene]['locations']):
            five_and_three.add(gene)
        if ("""3'UTR""" in stats[gene]['locations']) and (
            """CDS""" in stats[gene]['locations']):
            cds_and_three.add(gene)
        if ("""3'UTR""" in stats[gene]['locations']) and (
            """5'UTR""" in stats[gene]['locations']) and (
            """CDS""" in stats[gene]['locations']
        ):
            five_cds_and_three.add(gene)
        if set(stats[gene]['locations']) == set(["3'UTR"]):
            only_three.add(gene)
        if set(stats[gene]['locations']) == set(["5'UTR"]):
            only_five.add(gene)
        if set(stats[gene]['locations']) == set(["CDS"]):
            only_cds.add(gene)
        if len(stats[gene]['locations']) > 1:
            has_secondary.add(gene)
    print("""
    Number of genes: {total}
    Number of genes with only 3' peaks: {o_3}
    Number of genes with only CDS peaks: {o_cds}
    Number of genes with only 5' peaks: {o_5}
    Number of genes with 3' and 5' peaks: {a}
    Number of genes with 3' and CDS peaks: {b}
    Number of genes with 5' and 3' and CDS peaks: {c}
    Number of genes with a secondary peak: {has_second}
    """.format(
        total=len(set(stats.keys())),
        o_3=len(only_three),
        o_cds=len(only_cds),
        o_5=len(only_five),
        a=len(five_and_three),
        b=len(cds_and_three),
        c=len(five_cds_and_three),
        has_second=len(has_secondary)
    ))
    labels = ['ncRNA', """3'UTR""", """5'UTR""", 'CDS']