def _update_subtree_sec(self, item, data):
    """ Add a new subtree to the current QTreeWidgetItem. """
    if not isinstance(data, dict):
        for s in self.similar_items:
            sitem = QTreeWidgetItem([None, s])
            sitem.setToolTip(1, s)
            item.addChild(sitem)
        if not isinstance(data, self.noPrintTypes):
            for c in range(item.childCount()):
                item.child(c).setCheckState(0, Qt.Unchecked)
                self.checkableItems.append(item.child(c))
    else:
        for n, k in enumerate(realsorted(data.keys(), key=_lowercase)):
            item.addChild(QTreeWidgetItem([None, k]))
            child = item.child(n)
            if isinstance(data[k], dict):
                self._update_subtree(child, data[k])
            else:
                for s in self.similar_items:
                    sitem = QTreeWidgetItem([None, s])
                    sitem.setToolTip(0, s)
                    child.addChild(sitem)
                if not isinstance(data[k], self.noPrintTypes):
                    for c in range(child.childCount()):
                        child.child(c).setCheckState(0, Qt.Unchecked)
                        self.checkableItems.append(child.child(c))
def _make_groups(self, trajectory_categories, sort_category):
    r"""Groups the sample ids in `self._metadata_map` by the values in
    `trajectory_categories`

    Creates `self._groups`, a dictionary keyed by category, whose values are
    dictionaries in which the keys represent the group name within the
    category and the values are ordered lists of sample ids

    If `sort_category` is not None, the sample ids are sorted based on the
    values under this category in the metadata map. Otherwise, they are
    sorted using the sample id.

    Parameters
    ----------
    trajectory_categories : list of str
        A list of metadata categories to use to create the groups.
        Default: None, compute all of them
    sort_category : str or None
        The category from self._metadata_map to use to sort groups
    """
    # If sort_category is provided, we use the value of such category to
    # sort. Otherwise, we use the sample id.
    if sort_category:
        def sort_val(sid):
            return self._metadata_map[sort_category][sid]
    else:
        def sort_val(sid):
            return sid

    self._groups = defaultdict(dict)
    for cat in trajectory_categories:
        # Group samples by category
        gb = self._metadata_map.groupby(cat)
        for g, df in gb:
            self._groups[cat][g] = realsorted(df.index, key=sort_val)
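# A minimal sketch of the realsorted key= pattern used above (made-up sample
# ids and metadata values; not part of the original class):
from natsort import realsorted

sample_ids = ['S10', 'S2', 'S1']
collection_day = {'S1': 'day 1', 'S2': 'day 12', 'S10': 'day 3'}
print(realsorted(sample_ids, key=lambda sid: collection_day[sid]))
# ['S1', 'S10', 'S2']   (ordered by 'day 1' < 'day 3' < 'day 12')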
def _update_subtree(self, item, data):
    """ Add a new subtree to the current QTreeWidgetItem. """
    for n, k in enumerate(realsorted(data.keys(), key=_lowercase)):
        item.addChild(QTreeWidgetItem([None, k]))
        child = item.child(n)
        if isinstance(data[k], dict):
            self._update_subtree(child, data[k])
        elif not isinstance(data[k], self.noPrintTypes):
            child.setCheckState(0, Qt.Unchecked)
            self.checkableItems.append(child)
def generate_database_lines(data):
    lines = []
    for dat in data:
        s = '%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (
            dat['cid'], dat['CAS'], dat['formula'], round(dat['MW'], 9),
            dat['smiles'], dat['inchi'], dat['inchikey'], dat['name'])
        s += '\t'.join(dat['synonyms'])
        s += '\n'
        lines.append(s)
    lines = realsorted(lines)
    return lines
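# The final realsorted call orders the assembled lines by the leading numeric
# cid even though each line is a string (illustrative rows, not real data):
from natsort import realsorted

print(realsorted(['10\tCAS-B', '2\tCAS-A', '1\tCAS-C']))
# ['1\tCAS-C', '2\tCAS-A', '10\tCAS-B']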
def calc_summary_stats(output_info, cutoff):
    """Calculate average read depth, number of peaks, standard deviation
    and report each peak for each msi range in the bed file
    """
    sites = {}
    # msi_info is all loci for this chromosome
    for name, info in output_info.items():
        # Set total average depth for this site
        if info['total_depth'] != 0 and info['total_sites'] != 0:
            # Use ceil to round up
            average_depth = ceil(float(info['total_depth']) / info['total_sites'])
            # Turn to int
            average_depth = int(average_depth)
        else:
            average_depth = 0
        if average_depth != 0 and info['total_mutant_depth'] <= average_depth:
            wildtype_ave_depth = int(average_depth - info['total_mutant_depth'])
            wildtype_fraction = float(wildtype_ave_depth) / average_depth
        else:
            wildtype_fraction, wildtype_ave_depth = 0, 0
            sites[0] = '0:0:0'
        if info['indels']:
            highest_frac = calc_highest_peak(info['indels'], wildtype_fraction,
                                             average_depth)
            sites = calc_wildtype(list(info['indels'].keys()),
                                  wildtype_ave_depth, wildtype_fraction,
                                  highest_frac)
            num_peaks, peaks = calc_number_peaks(info['indels'], sites,
                                                 highest_frac, cutoff)
            stdev = calc_std_peaks(peaks.values())
            # Sort the peak list naturally (-3,-2,-1,0,1,2,3)
            peak_list = " ".join(str(x) for x in natsort.realsorted(peaks.values()))
        elif average_depth != 0:
            # if there are no indels, but there are wild type reads
            wildtype_fraction = 1
            sites[0] = ":".join(['0', str(float(wildtype_fraction)),
                                 str(wildtype_ave_depth)])
            num_peaks = 1
            peak_list = sites[0]
            stdev = 0
        else:
            # if there are no reads at this site
            wildtype_fraction = 0
            sites[0] = ":".join(['0', str(float(wildtype_fraction)),
                                 str(wildtype_ave_depth)])
            num_peaks = 0
            peak_list = sites[0]
            stdev = 0
        output_info[name] = {'Name': info['Name'],
                             'Average_Depth': average_depth,
                             'Standard_Deviation': stdev,
                             'Number_of_Peaks': num_peaks,
                             'IndelLength:AlleleFraction:SupportingCalls': peak_list}
    return output_info
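# Why realsorted for the peak list above: indel lengths may be negative, and
# natsort.realsorted parses the leading sign, giving -3 < -1 < 0 < 1.
# A sketch with illustrative "length:fraction:calls" strings:
import natsort

print(natsort.realsorted(['1:0.2:5', '-3:0.1:2', '0:0.6:9', '-1:0.1:3']))
# ['-3:0.1:2', '-1:0.1:3', '0:0.6:9', '1:0.2:5']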
def rsort(strings_to_sort):
    strs = []
    for arg in strings_to_sort:
        if '*' in arg:
            res = glob.glob(arg)
        else:
            # TODO: check if file
            res = arg
        strs.append(res)
    strs = natsort.realsorted(flatten_list(strs))
    for string in strs:
        print(string)
    return strs
def _dlg_combine(self):
    """ Open a dialog to combine the dataset. """
    trace = self._get_obj_trace(self.datatree.current_item())
    data = self.get(trace)
    keys = realsorted(data, key=lambda x: x.lower())
    d0 = data.get(keys[0])
    npt = self.noPrintTypes + (dict,)
    # Search for occurrences of arrays with the same shape as the first one
    if isinstance(d0, npt):
        n_keys = [k for k in keys if isinstance(data.get(k), type(d0))]
        data_shape = ()
    else:
        n_keys = []
        for k in keys:
            if not isinstance(data.get(k), npt):
                if data.get(k).shape == d0.shape:
                    n_keys.append(k)
        data_shape = data.get(n_keys[0]).shape
    # Show a dialog asking if the conversion should be done.
    if len(data_shape) > 1:
        txt = "Combine the first {} datasets of {} element(s) into one?"
        txt = txt.format(len(n_keys), data_shape)
    else:
        txt = "Combine {} elements into 1D vector?".format(len(n_keys))
    btns = (QMessageBox.Yes | QMessageBox.No)
    msg = QMessageBox(QMessageBox.Information, "Info", txt, buttons=btns)
    msg.setDefaultButton(QMessageBox.Yes)
    if msg.exec_() != QMessageBox.Yes:
        return
    # Add 'combined' if not all values are combined or it is a topLevelItem
    if len(n_keys) != len(keys) or len(trace) == 1:
        trace.append('combined')
    # Perform the combination
    try:
        self.set_data(trace, np.array([data.get(k) for k in n_keys]))
    except ValueError:
        # For h5py dictionaries
        self.set_data(trace, np.array([data.get(k)[()] for k in n_keys]))
    # Remove the combined data
    if len(n_keys) != len(keys) and len(data_shape) > 1:
        _ = [self.get(trace[:-1]).pop(key) for key in n_keys]
    # Put new dimension at the end and remove singleton dimensions.
    self.set_data(trace, np.moveaxis(self.get(trace), 0, -1).squeeze())
    self.datatree.update_tree()
def __sort_data_by_label_order(self):
    '''
    Uses the natsort package's realsorted to sort strings such that
    substrings that are numeric values are sorted in numeric order,
    and non-numeric substrings are sorted lexicographically.
    '''
    ls_natsorted_labels = realsorted(self.current_data['labels'])
    l_idx_labels_sorted = [self.current_data['labels'].index(s_label)
                           for s_label in ls_natsorted_labels]
    ls_sorted_labels = [self.current_data['labels'][idx]
                        for idx in l_idx_labels_sorted]
    ls_sorted_value_lists = [self.current_data['value_lists'][idx]
                             for idx in l_idx_labels_sorted]
    self.current_data['labels'] = ls_sorted_labels
    self.current_data['value_lists'] = ls_sorted_value_lists
    return
def get_articles_files():
    # recursive=True is required for '**' to match nested directories
    return realsorted(glob.glob("{}/**/*.md".format(INPUT_DIRECTORY),
                                recursive=True))
""" Spyder Editor This is a temporary script file. """ import numpy as np, pandas as pd import matplotlib.pyplot as plt from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator) import re import matplotlib.ticker as ticker import glob import natsort f1 = glob.glob('u*.txt') f1 = natsort.realsorted(f1) f2 = glob.glob('head&tail.txt') f2 = natsort.realsorted(f2) def readData(filename): d = pd.read_csv(filename, delim_whitespace=True, header=None) d = np.asarray(d) return d dSlug = readData(f2[0]) lSlug = dSlug[:, 1] - dSlug[:, 0] meanSlug = np.mean(lSlug) stdSlug = np.std(lSlug) t = np.arange(len(lSlug))
def test_realsorted_is_identical_to_natsorted_with_real_alg(float_list):
    assert realsorted(float_list) == natsorted(float_list, alg=ns.REAL)
a = ['Apple', 'corn', 'Corn', 'Banana', 'apple', 'banana']
humansorted(a)
natsorted(a)
natsorted(a, alg=ns.IGNORECASE)
natsorted(a, alg=ns.LOWERCASEFIRST)
natsorted(a, alg=ns.GROUPLETTERS)
natsorted(a, alg=ns.G | ns.LF)

a = ['a50', 'a51.', 'a+50.4', 'a5.034e1', 'a+50.300']
natsorted(a, alg=ns.FLOAT)
natsorted(a, alg=ns.FLOAT | ns.SIGNED)
natsorted(a, alg=ns.FLOAT | ns.SIGNED | ns.NOEXP)
natsorted(a, alg=ns.REAL)
from natsort import realsorted
realsorted(a)

from operator import attrgetter, itemgetter
a = [['a', 'num4'], ['b', 'num8'], ['c', 'num2']]
natsorted(a, key=itemgetter(1))


class Foo:
    def __init__(self, bar):
        self.bar = bar

    def __repr__(self):
        return "Foo('{0}')".format(self.bar)


b = [Foo('num3'), Foo('num5'), Foo('num2')]
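# The attrgetter import and the list `b` above set up natsort's documented
# object-sorting pattern; the completing call (with its expected result) is:
natsorted(b, key=attrgetter('bar'))
# [Foo('num2'), Foo('num3'), Foo('num5')]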
def df_read_filecols(df, filecols, *, order_sites=True):
    """Merges data frame with entries read from files.

    Designed to expand data frame listing CSV files with site or
    mutation-level selection information into a data frame listing
    the information in these CSV files.

    Args:
        `df` (pandas DataFrame)
            Each row gives files and associated information.
        `filecols` (list)
            List of columns in `df` that give filenames of CSV files
            to add to data frame. These CSV files cannot have column
            names already in `df`.
        `order_sites` (bool)
            Expect a `site` column, make it naturally sorted categorical
            variable, and add `isite` column that numbers sites 0, 1, ...

    Returns:
        A data frame where the entries in the files are now read as
        columns.

    >>> tf = tempfile.NamedTemporaryFile
    >>> with tf(mode='w') as sitediffsel1, tf(mode='w') as sitediffsel2, \\
    ...         tf(mode='w') as mutdiffsel1, tf(mode='w') as mutdiffsel2:
    ...
    ...     # first sitediffsel file
    ...     _ = sitediffsel1.write('site,sitediffsel\\n'
    ...                            '1,3.2\\n'
    ...                            '-1,2.3\\n'
    ...                            '(HA2)1,0.1')
    ...     sitediffsel1.flush()
    ...
    ...     # first mutdiffsel file
    ...     _ = mutdiffsel1.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                           '-1,A,C,-0.7\\n'
    ...                           '-1,A,G,3.0\\n'
    ...                           '1,C,A,1.2\\n'
    ...                           '1,C,G,2.0\\n'
    ...                           '(HA2)1,C,A,0.0\\n'
    ...                           '(HA2)1,C,G,0.1')
    ...     mutdiffsel1.flush()
    ...
    ...     # second sitediffsel file
    ...     _ = sitediffsel2.write('site,sitediffsel\\n'
    ...                            '(HA2)1,9.1\\n'
    ...                            '1,1.2\\n'
    ...                            '-1,0.3\\n')
    ...     sitediffsel2.flush()
    ...
    ...     # second mutdiffsel file
    ...     _ = mutdiffsel2.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                           '-1,A,C,-0.2\\n'
    ...                           '-1,A,G,0.5\\n'
    ...                           '1,C,A,1.1\\n'
    ...                           '1,C,G,0.1\\n'
    ...                           '(HA2)1,C,A,9.0\\n'
    ...                           '(HA2)1,C,G,0.1')
    ...     mutdiffsel2.flush()
    ...
    ...     # data frame with files as columns
    ...     df = pandas.DataFrame({
    ...         'name':['sample_1', 'sample_2'],
    ...         'serum':['serum_1', 'serum_1'],
    ...         'sitediffsel_file':[sitediffsel1.name, sitediffsel2.name],
    ...         'mutdiffsel_file':[mutdiffsel1.name, mutdiffsel2.name]
    ...         })
    ...
    ...     # call df_read_filecols
    ...     (df_read_filecols(df, ['sitediffsel_file', 'mutdiffsel_file'])
    ...      .drop(columns=['sitediffsel_file', 'mutdiffsel_file']))
            name    serum    site  sitediffsel wildtype mutation  mutdiffsel  isite
    0   sample_1  serum_1       1          3.2        C        A         1.2      1
    1   sample_1  serum_1       1          3.2        C        G         2.0      1
    2   sample_1  serum_1      -1          2.3        A        C        -0.7      0
    3   sample_1  serum_1      -1          2.3        A        G         3.0      0
    4   sample_1  serum_1  (HA2)1          0.1        C        A         0.0      2
    5   sample_1  serum_1  (HA2)1          0.1        C        G         0.1      2
    6   sample_2  serum_1  (HA2)1          9.1        C        A         9.0      2
    7   sample_2  serum_1  (HA2)1          9.1        C        G         0.1      2
    8   sample_2  serum_1       1          1.2        C        A         1.1      1
    9   sample_2  serum_1       1          1.2        C        G         0.1      1
    10  sample_2  serum_1      -1          0.3        A        C        -0.2      0
    11  sample_2  serum_1      -1          0.3        A        G         0.5      0
    """
    if not len(df):
        raise ValueError('`df` has no rows')
    df_cols = set(df.columns)
    if 'dummy' in df_cols:
        raise ValueError('`df` has column named "dummy"')
    if not (set(filecols) <= df_cols):
        raise ValueError('`df` does not have all the `filecol` columns')

    df_filecols = []
    for row in df.iterrows():
        # get data frame of just row, with a dummy column for merging
        row_df = row[1].to_frame().transpose().assign(dummy=1)
        for col in filecols:
            filename = row_df.at[row[0], col]
            file_df = pandas.read_csv(filename).assign(dummy=1)
            if order_sites and 'site' not in file_df.columns:
                raise ValueError(f"no `site` column in {filename}")
            sharedcols = set(file_df.columns).intersection(df_cols)
            if sharedcols:
                raise ValueError(f"`df` and {filename} share columns "
                                 f"{sharedcols}")
            row_df = row_df.merge(file_df)
        df_filecols.append(row_df)
    df_filecols = (pandas.concat(df_filecols, ignore_index=True)
                   .drop('dummy', axis='columns'))

    if order_sites:
        sites = natsort.realsorted(df_filecols['site'].unique())
        df_filecols = (df_filecols.assign(
            site=lambda x: pandas.Categorical(x['site'], sites, ordered=True),
            isite=lambda x: x['site'].cat.codes))

    return df_filecols
    u1 = u1[s ^ 1::2]
    w1 = w1[s ^ 1::2]
    return u0, w0, u1, w1


def get_token(f, token):
    return f.split(token)[-1].split('_')[0]


def main(f):
    if '_F' in f:
        om, al = [get_token(f, token) for token in ['B', 'F']]
    else:
        om, al = [get_token(f, token) for token in ['_o', '_a']]
    u0, w0, u1, w1 = get_data(f)
    figure(1, figsize=(8, 8))
    clf()
    plot(u0, w0, 'o', mec='none', mfc='k', ms=1)
    plot(u1, w1, 'o', mec='none', mfc='r', ms=1)
    xlabel(r'$u$')
    ylabel(r'$w$')
    title(r'$(\omega,\alpha)=$' + f'({om:s},{al:s})', y=1.10)
    savefig(f'fig/space_time_monitor/o{om:s}_a{al:s}.png')
    print(f'plotted: {f:s}')
    return None


G = natsort.realsorted(glob.glob(sys.argv[1]))
for g in G:
    main(g)
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from gesture.config import *
from natsort import natsorted, realsorted
from common_plot import barplot_annotate_brackets

top5_sid = [4, 10, 13, 29, 41]
depths = [1, 2, 3, 4, 5, 6]
data_dir = '/Users/long/Documents/data/gesture/'  # temp data dir
training_result_dir = data_dir + 'training_result/dl_depth/'
accuracy_all = []
for sid in top5_sid:
    sid_acc = []
    tmp = realsorted([pth for pth in Path(training_result_dir + str(sid)).iterdir()
                      if pth.suffix == '.npy' and 'changeDepth' in str(pth)])
    for depth in depths:
        result = np.load(str(tmp[depth - 1]), allow_pickle=True).item()
        sid_acc.append(result['test_acc'])
    accuracy_all.append(sid_acc)
# perform best at depth=1

from matplotlib.patches import Patch

colors = ['orangered', 'yellow', 'gold', 'orange', 'springgreen',
          'aquamarine']  # ,'skyblue'
depth_label = [(str(i) + ' layer') if i == 1 else (str(i) + ' layers')
               for i in depths]
cmap = dict(zip([str(i) for i in depth_label], colors))
patches = [Patch(color=v, label=k) for k, v in cmap.items()]

fig, ax = plt.subplots()
x = [1, 2, 3, 4, 5, 6]  # 6 depths
# BUG
accuracy_all_bug = np.asarray(accuracy_all)
def test_realsorted_returns_results_identical_to_natsorted_with_REAL():
    a = ['a50', 'a51.', 'a50.31', 'a-50', 'a50.4', 'a5.034e1', 'a50.300']
    assert realsorted(a) == natsorted(a, alg=ns.REAL)
def comparePrefs(prefs1, prefs2, sites=None, distmetric='half_sum_abs_diff',
                 chars=dms_tools2.AAS):
    """Compute error-corrected distance between two sets of preferences.

    Designed for the situation in which you have made replicate
    measurements of the amino-acid preferences for two protein homologs,
    and want to estimate the difference in preferences at each site while
    correcting for experimental error as quantified by the replicate
    measurements.

    The *distance* between each pair of replicates at each site is
    computed using `prefDistance` with `distmetric`. We then compute the
    RMS distance between all pairs for the same homolog to get
    `RMSDwithin`, and all pairs of different homologs to get
    `RMSDbetween`. We calculate `RMSDcorrected` as
    `RMSDbetween - RMSDwithin`.

    We also compute the mean (across replicates) preference for homolog 1
    minus the mean for homolog 2, scaled so that the total height in each
    direction equals `RMSDcorrected`. These values are an error-corrected
    estimate of the difference in preference for each amino acid between
    homologs.

    Args:
        `prefs1` (list)
            Files giving replicate measurements of preferences for
            homolog 1 in the CSV format returned by ``dms2_prefs``.
        `prefs2` (list)
            Files giving measurements for homolog 2.
        `sites` (list or `None`)
            If `None`, compare all sites shared between the two homolog
            preference sets. Otherwise should be a list of the sites
            to compare.
        `distmetric` (string)
            Distance metric to use. Can be any valid option for the
            argument of the same name to `prefDistance`.
        `chars` (list)
            List of characters for which we analyze the preferences.
            For instance, all 20 amino acids.

    Returns:
        A `pandas.DataFrame` giving the distances at each site, as well
        as the replicate mean difference between preferences for homolog
        1 minus homolog 2 for each amino acid at each site scaled to
        height of `RMSDcorrected` in each direction.

    Example calculation for two character sequences and two replicates
    for each homolog:

    >>> TF = functools.partial(tempfile.NamedTemporaryFile, mode='w')
    >>> with TF() as p1_1, TF() as p1_2, TF() as p2_1, TF() as p2_2:
    ...     n = p1_1.write('''site, A, C
    ...                       1, 0.8, 0.2
    ...                       2, 0.3, 0.7'''.replace(' ', ''))
    ...     p1_1.flush()
    ...     n = p1_2.write('''site, A, C
    ...                       1, 0.8, 0.2
    ...                       2, 0.4, 0.6'''.replace(' ', ''))
    ...     p1_2.flush()
    ...     n = p2_1.write('''site, A, C
    ...                       2, 0.4, 0.6
    ...                       1, 0.6, 0.4'''.replace(' ', ''))
    ...     p2_1.flush()
    ...     n = p2_2.write('''site, A, C
    ...                       1, 0.6, 0.4
    ...                       1a, 0.4, 0.6
    ...                       2, 0.5, 0.5'''.replace(' ', ''))
    ...     p2_2.flush()
    ...     diffs = comparePrefs([p1_1.name, p1_2.name],
    ...                          [p2_1.name, p2_2.name],
    ...                          chars=['A', 'C'])
    >>> print(diffs.to_string(float_format=lambda x: '{0:.2f}'.format(x)))
      site  RMSDcorrected  RMSDbetween  RMSDwithin     A     C
    0    1           0.20         0.20        0.00  0.20 -0.20
    1    2           0.02         0.12        0.10 -0.02  0.02
    """
    assert len(prefs1) > 1, "provide prefs for multiple replicates"
    assert len(prefs2) > 1, "provide prefs for multiple replicates"

    # read in all preferences
    prefs = []
    expectcols = ['site'] + chars
    for (homolog, homologprefs) in enumerate([prefs1, prefs2], 1):
        for (rep, repprefs) in enumerate(homologprefs, 1):
            iprefs = pandas.read_csv(repprefs)
            iprefs['site'] = iprefs['site'].astype('str')
            assert set(iprefs.columns) <= set(expectcols), \
                "{0} missing expected columns".format(repprefs)
            prefs.append(iprefs[expectcols].assign(homolog=homolog,
                                                   replicate=rep))

    # get only desired sites
    if sites is None:
        # use sites shared among all preference sets
        sites = list(set.intersection(*[set(p['site']) for p in prefs]))
    assert isinstance(sites, list) and len(sites), "no `sites` to analyze"
    sites = natsort.realsorted(list(map(str, sites)))

    # merge preferences for desired sites
    assert all([set(p['site']) >= set(sites) for p in prefs]), \
        "not all prefs have all sites"
    prefs = [p[p['site'].isin(sites)] for p in prefs]
    prefs = pandas.concat(prefs)
    prefs['site'] = pandas.Categorical(prefs['site'], sites)
    prefs = prefs.sort_values('site').set_index('site')

    # compute RMSDs
    dists = {'within': [], 'between': []}
    for ((hi, repi), (hj, repj)) in itertools.combinations(
            [(h, rep) for h in [1, 2]
             for rep in prefs.query('homolog == @h')['replicate'].unique()],
            2):
        prefsi = (prefs.query('homolog == @hi and replicate == @repi')[chars])
        prefsj = (prefs.query('homolog == @hj and replicate == @repj')[chars])
        assert prefsi.index.equals(prefsj.index)
        disttype = {True: 'within', False: 'between'}[hi == hj]
        dists[disttype].append([
            prefDistance(prefsi.loc[r], prefsj.loc[r], distmetric)
            for r in sites
        ])
    for (disttype, dist) in dists.items():
        distseries = (pandas.DataFrame(dist, columns=sites)
                      .transpose()
                      .apply(computeRMS, axis=1))
        prefs['RMSD' + disttype] = distseries
    prefs['RMSDcorrected'] = prefs['RMSDbetween'] - prefs['RMSDwithin']
    rmsds = ['RMSDcorrected', 'RMSDbetween', 'RMSDwithin']

    # compute RMSDcorrected-scaled diff between homologs for each pref
    prefmeans = {}
    for homolog in [1, 2]:
        prefmeans[homolog] = (prefs.reset_index()
                              .query('homolog == @homolog')
                              .groupby('site')[chars].mean())
    prefs = prefs[~prefs.index.duplicated(keep='first')][rmsds]
    dprefs = prefmeans[1] - prefmeans[2]
    # normalize so sums to one in each direction
    dprefs = dprefs.div(dprefs.abs().sum(axis=1), axis=0).mul(2).fillna(0)
    dprefs = dprefs.mul(prefs['RMSDcorrected'], axis=0)
    prefs = prefs.join(dprefs)

    return prefs[rmsds + chars].reset_index()
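# Sketch of the site ordering produced by the realsorted call above
# (illustrative site labels): purely numeric sites sort in numeric order,
# and labels with non-numeric prefixes such as '(HA2)1' sort after them.
import natsort

print(natsort.realsorted(['2', '1a', '(HA2)1', '1']))
# ['1', '1a', '2', '(HA2)1']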
def graphInterval(*, data: dict, title: str = None, xLabel: str = None,
                  yLabel: str = None, gridLines: str = "",
                  groupNames: tuple = (), colorIndex: int = None,
                  show: bool = False):
    _, ax = plt.subplots()
    if len(groupNames) > 1:
        groupNames = (":\n".join(reversed(groupNames))) + ":"
        plt.text(-.015, -.02, s=groupNames, horizontalalignment="right",
                 verticalalignment="top", transform=ax.transAxes)
    sortedData = natsort.realsorted(data.items(), key=lambda t: t[0])
    colorCatX = defaultdict(list)
    colorCatY = defaultdict(list)
    for key, value in sortedData:
        inverseKey = "\n".join(reversed(key.split("\n")))
        if colorIndex is not None:
            colorCatX[key.split("\n")[colorIndex]].append(inverseKey)
            colorCatY[key.split("\n")[colorIndex]].append(value[1])
        else:
            colorCatX[key.split("\n")[0]].append(inverseKey)
            colorCatY[key.split("\n")[0]].append(value[1])
        if key in data:
            if value[0] is not None:
                plt.scatter(inverseKey, value[0], marker="_", color="black")
            if value[2] is not None:
                plt.scatter(inverseKey, value[2], marker="_", color="black")
            if value[0] is not None and value[2] is not None:
                plt.plot([inverseKey] * 3, [value[0], value[1], value[2]],
                         linewidth=.85, color="black")
    if colorIndex is not None:
        for key, _ in colorCatX.items():
            plt.scatter(colorCatX[key], colorCatY[key], label=key)
        plt.legend(loc="best").set_draggable(True)
    else:
        for key, _ in colorCatX.items():
            plt.scatter(colorCatX[key], colorCatY[key], color="blue")
    plt.title(title)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    if len(gridLines) == 1:
        plt.grid(which="major", axis=gridLines)
    elif gridLines == "xy":
        plt.grid(which="major", axis="both")
    plt.tight_layout()
    if show:
        plt.show()
    else:
        return plt.gcf()
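# Sketch of the sorting step above (made-up data): realsorted accepts a key,
# so the (label, value) pairs are ordered naturally by label.
import natsort

data = {'day 10\nA': (1, 2, 3), 'day 2\nA': (0, 1, 2)}
print([k for k, _ in natsort.realsorted(data.items(), key=lambda t: t[0])])
# ['day 2\nA', 'day 10\nA']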
def get_paternNames(f):
    glob_str = pat_str = f
    G = glob.glob(pat_str)
    F = natsort.realsorted(G)
    return F
def test_realsorted_returns_results_identical_to_natsorted_with_REAL():
    a = ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"]
    assert realsorted(a) == natsorted(a, alg=ns.REAL)
def sort_lines(input_lines):
    return realsorted(input_lines)
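# A minimal usage sketch for sort_lines (made-up input): realsorted parses
# signed real numbers, so "x-2" precedes "x1.5".
print(sort_lines(["x1.5", "x-2", "x10"]))
# ['x-2', 'x1.5', 'x10']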
def readAsort(filename):
    f = glob.glob(filename)
    f = natsort.realsorted(f)
    return f
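# Why realsorted rather than sorted() here (illustrative filenames): plain
# lexicographic sorting would put 'f10.txt' before 'f2.txt'.
import natsort

print(natsort.realsorted(['f10.txt', 'f2.txt', 'f1.txt']))
# ['f1.txt', 'f2.txt', 'f10.txt']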
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 8 16:35:35 2019

@author: c3216945
"""
import pyvista as pv
import numpy as np
import glob
import natsort

f = glob.glob('*.vtk')
f = natsort.realsorted(f)


def getSlugUz(filename, sampleSize):
    # read vtk as unstructured data
    data = pv.read(filename)
    aw = data.point_arrays['alpha.water']
    pts = data.points
    # get alpha.water on the middle line
    aw = aw[(np.abs(pts[:, 0]) <= 1e-4) & (np.abs(pts[:, 1]) <= 1e-4)]
    pts = pts[(np.abs(pts[:, 0]) <= 1e-4) & (np.abs(pts[:, 1]) <= 1e-4)]
    # keep index
    ind = np.arange(len(aw)).reshape(len(aw), 1)
    n1 = np.hstack((aw.reshape(len(aw), 1), pts))
    n1 = np.hstack((n1, ind))
    # sort based on z
    n1 = n1[n1[:, 3].argsort()]
def test_realsorted_returns_results_identical_to_natsorted():
    a = ['a50', 'a51.', 'a50.31', 'a50.4', 'a5.034e1', 'a50.300']
    assert realsorted(a) == natsorted(a)
def worker_user(params):
    """Worker for calculating metrics for one user.

    Uses only one core.
    """
    (user_idx, output_dir, test_batches, RQ_cap_adjust, for_epoch,
     verbose, only_rl, algo_feed, algo_frac, merge_sinks) = params

    save_dir = os.path.join(output_dir, save_dir_tmpl.format(user_idx))

    if verbose:
        print('Working on user_idx: {}'.format(user_idx))

    with open(os.path.join(save_dir, 'user_opt_dict.dill'), 'rb') as f:
        user_opt_dict = dill.load(f)

    one_user_data = user_data[user_idx]
    if merge_sinks:
        if verbose:
            print('Merged sinks!')
        one_user_data = RDU.merge_sinks(one_user_data)

    ret = {
        'user_idx': user_idx,
        'user_id': one_user_data['user_id'],
        'num_other_posts': one_user_data['num_other_posts'],
        'num_own_posts': one_user_data['num_user_events'],
        'num_followees': one_user_data['num_followees'],
        'duration': one_user_data['duration'],
        'num_followers': user_opt_dict['num_followers'],
        'N': user_opt_dict['N'],
        'reward_kind': user_opt_dict['trainer_opts_dict']['reward_kind'],
        'for_epoch': for_epoch,
        'num_batches': test_batches,
        'algo_feed': algo_feed,
    }

    window_start, eval_sim_opts = EB.make_real_data_batch_sim_opts(
        one_user_data=one_user_data,
        N=user_opt_dict['N'],
        seed=-1,
        is_test=True)
    ret['window_start'] = window_start
    ret['window_end'] = eval_sim_opts.end_time

    # The file names are of the form `*/tpprl.ckpt-<num>.meta`.
    # Hence, the number is interpreted as negative.
    # So to extract the last checkpoint, we do a sort by real-values and
    # pick the most negative value.
    # Also, we drop the `.meta` suffix.
    if for_epoch < 0:
        all_chpt_file = glob.glob(os.path.join(save_dir, '*.meta'))
        if len(all_chpt_file) == 0:
            if verbose:
                print('No chpt files found for {}.'.format(user_idx))
            return ret
        chosen_chpt_file = natsort.realsorted(all_chpt_file)[0][:-5]
    else:
        chosen_chpt_file = os.path.join(save_dir,
                                        'tpprl.ckpt-{}'.format(for_epoch))

    if verbose:
        print('chosen_chpt_file = ', chosen_chpt_file)

    ret['chpt_file'] = chosen_chpt_file

    if not os.path.exists(chosen_chpt_file + '.meta'):
        ret['error'] = 'File Not Found: {}.'.format(chosen_chpt_file)
        return ret

    rl_b_dict = EB.rl_b_dict_from_chpt(
        # '/NL/crowdjudged/work/rl-broadcast/r_2-sim-opt-fix/train-save-user_idx-218/tpprl.ckpt-898',
        chosen_chpt_file,
        one_user_data=one_user_data,
        window_start=window_start,
        user_opt_dict=user_opt_dict)

    sink_ids = one_user_data['sim_opts'].sink_ids
    if algo_feed:
        algo_c = user_opt_dict['algo_c']
        lifetimes = defaultdict(
            lambda: (eval_sim_opts.end_time - window_start) / 10.)
        # algo_feed_args = ES.make_prefs(sink_ids, src_ids, seed=algo_feed_seed,
        #                                src_lifetime_dict=lifetimes)
        algo_feed_args = ES.make_freq_prefs(one_user_data=one_user_data,
                                            sink_ids=sink_ids,
                                            src_lifetime_dict=lifetimes)
        rl_b_dict['algo_feed'] = algo_feed
        rl_b_dict['algo_feed_args'] = algo_feed_args
        rl_b_dict['algo_c'] = algo_c
        rl_b_dict['t_min'] = window_start

    # This is the "K" in top-K
    K = 1

    if 'q' in user_opt_dict:
        q = user_opt_dict['q']
    else:
        warnings.warn('Setting q manually.')
        reward_kind = user_opt_dict['trainer_opts_dict']['reward_kind']
        if reward_kind == 'r_2_reward':
            q = 100.0
        elif reward_kind == 'top_k_reward':
            q = 1.0

    init_seed = 865

    rl_dfs = []
    rl_events = []
    rl_u_2 = []
    for idx in range(test_batches):
        mgr, exp_b = EB.get_real_data_mgr_chpt_np(
            rl_b_dict,
            t_min=window_start,
            batch_sim_opt=eval_sim_opts,
            seed=init_seed + idx,
            with_broadcaster=True)
        mgr.run_dynamic(max_events=MAX_EVENTS)
        rl_dfs.append(mgr.get_state().get_dataframe())
        rl_events.append(mgr.state.events)

        # Calculating the u^2 loss
        c_is = exp_b.get_all_c_is()
        time_deltas = exp_b.get_all_time_deltas()
        rl_u_2.append(exp_b.exp_sampler.calc_quad_loss(time_deltas, c_is))

    num_tweets = [
        RU.num_tweets_of(df, broadcaster_id=eval_sim_opts.src_id)
        for df in rl_dfs
    ]
    capacity_cap, capacity_std = np.mean(num_tweets), np.std(num_tweets)

    ret['capacity'] = capacity_cap
    ret['capacity_std'] = capacity_std
    ret['RL_u_2_mean'] = np.mean(rl_u_2)
    ret['RL_u_2_std'] = np.std(rl_u_2)

    if not only_rl:
        # Figure out what 'q' to use for RQ to get the same number of tweets.
        # Removing 'RQ_cap_adjust' because RQ systematically tweets more.
        q_RQ = RU.sweep_q(eval_sim_opts,
                          capacity_cap=capacity_cap - RQ_cap_adjust,
                          verbose=verbose, q_init=q, parallel=False,
                          max_events=MAX_EVENTS, max_iters=MAX_ITERS,
                          only_tol=True, tol=0.1)
        ret['q_RQ'] = q_RQ

        # Run RedQueen.
        RQ_dfs = []
        RQ_events = []
        for idx in range(test_batches):
            # Deliberately using eval_sim_opts.s, as it was used to
            # calculate q_RQ.
            # It seems to be initialized to constant (equal significance).
            opt = OM.Opt(src_id=eval_sim_opts.src_id, s=eval_sim_opts.s,
                         seed=init_seed + idx, q=q_RQ)
            mgr = eval_sim_opts.update({
                'q': q_RQ
            }).create_manager_with_broadcaster(opt)
            # mgr = eval_sim_opts.update({}).create_manager_with_opt(seed=init_seed + idx)
            mgr.state.time = window_start
            mgr.run_dynamic(max_events=MAX_EVENTS)
            RQ_dfs.append(mgr.get_state().get_dataframe())
            RQ_events.append(opt.state.events)

        if algo_feed:
            # Figure out what 'q' to use for RQ to get the same number of
            # tweets.
            # Removing 'RQ_cap_adjust' because RQ systematically tweets more.
            q_RQ_algo = ES.sweep_q_algo(
                sim_opts=eval_sim_opts,
                capacity_cap=capacity_cap - RQ_cap_adjust,
                algo_feed_args=algo_feed_args,
                algo_c=algo_c,
                verbose=verbose,
                q_init=1000.0,
                max_events=MAX_EVENTS,
                max_iters=MAX_ITERS,
                tol=0.1,
                only_tol=True,
                t_min=window_start,
            )
            ret['q_RQ_algo'] = q_RQ_algo

            # Run RedQueen heuristic.
            RQ_algo_dfs = []
            RQ_algo_events = []
            for idx in range(test_batches):
                # Deliberately not using eval_sim_opts.s, it seems to be
                # initialized to something strange.
                opt = ES.OptAlgo(src_id=eval_sim_opts.src_id,
                                 seed=init_seed + idx,
                                 q=q_RQ_algo,
                                 algo_feed_args=algo_feed_args,
                                 algo_c=algo_c)
                mgr = eval_sim_opts.update({
                    'q': q_RQ_algo
                }).create_manager_with_broadcaster(opt)
                # mgr = eval_sim_opts.update({}).create_manager_with_opt(seed=init_seed + idx)
                mgr.state.time = window_start
                mgr.run_dynamic(max_events=MAX_EVENTS)
                RQ_algo_dfs.append(mgr.get_state().get_dataframe())
                RQ_algo_events.append(opt.state.events)

        # Run Poisson.
        poisson_dfs = []
        poisson_events = []
        rate = capacity_cap / (eval_sim_opts.end_time - window_start)
        for idx in range(test_batches):
            poisson = OM.Poisson2(src_id=eval_sim_opts.src_id,
                                  seed=init_seed + idx, rate=rate)
            mgr = eval_sim_opts.create_manager_with_broadcaster(poisson)
            mgr.state.time = window_start
            mgr.run_dynamic(max_events=MAX_EVENTS)
            poisson_dfs.append(mgr.get_state().get_dataframe())
            poisson_events.append(mgr.get_state().events)

        # Running Karimi
        T = eval_sim_opts.end_time - window_start
        num_segments = 10
        seg_len = T / num_segments
        wall_mgr = eval_sim_opts.create_manager_for_wall()
        wall_mgr.run_dynamic(max_events=MAX_EVENTS)
        wall_df = wall_mgr.state.get_dataframe()

        ret['num_segments'] = num_segments
        ret['num_wall_tweets'] = wall_df.event_id.nunique()

        seg_idx = ((wall_df.t.values - window_start) / T *
                   num_segments).astype(int)
        intensity_df = (wall_df.groupby(
            ['sink_id', pd.Series(seg_idx, name='segment')]).size() /
            (T / num_segments)).reset_index(name='intensity')
        wall_intensities_df = intensity_df.pivot_table(
            values='intensity', index='sink_id', columns='segment').fillna(0)
        for seg_idx in range(num_segments):
            if seg_idx not in wall_intensities_df.columns:
                wall_intensities_df[seg_idx] = 0.0
        wall_intensities = wall_intensities_df[list(
            range(num_segments))].values

        # This is the single-threaded version
        params = (init_seed, capacity_cap, num_segments, eval_sim_opts,
                  wall_intensities, None)
        op = OR.worker_kdd(params, verbose=verbose, Ks=[K],
                           window_start=window_start)
        karimi_dfs = []
        karimi_events = []
        for idx in range(test_batches):
            piecewise = OM.PiecewiseConst(
                src_id=eval_sim_opts.src_id,
                seed=init_seed * 2 + idx,
                change_times=window_start + np.arange(num_segments) * seg_len,
                rates=op['kdd_opt_{}'.format(K)] / seg_len)
            piecewise_const_mgr = eval_sim_opts.create_manager_with_broadcaster(
                piecewise)
            piecewise_const_mgr.state.time = window_start
            piecewise_const_mgr.run_dynamic(max_events=MAX_EVENTS)
            df = piecewise_const_mgr.state.get_dataframe()
            karimi_dfs.append(df)
            karimi_events.append(piecewise_const_mgr.get_state().events)

    # Calculating metrics
    if only_rl:
        all_settings = [('RL', rl_dfs)]
    else:
        all_settings = [('RL', rl_dfs), ('RQ', RQ_dfs),
                        ('poisson', poisson_dfs), ('karimi', karimi_dfs)]
        if algo_feed:
            all_settings += [('RQ_algo', RQ_algo_dfs)]

    metric_name = 'num_tweets'
    for type, dfs in all_settings:
        metric = [
            RU.num_tweets_of(df, broadcaster_id=eval_sim_opts.src_id)
            for df in dfs
        ]
        ret[type + '_' + metric_name + '_mean'], \
            ret[type + '_' + metric_name + '_std'] = (np.mean(metric),
                                                      np.std(metric))

    metric_name = 'top_k'
    for type, dfs in all_settings:
        metric = [
            RU.time_in_top_k(df, K=K, sim_opts=eval_sim_opts) for df in dfs
        ]
        ret[type + '_' + metric_name + '_mean'], \
            ret[type + '_' + metric_name + '_std'] = (np.mean(metric),
                                                      np.std(metric))

    metric_name = 'avg_rank'
    for type, dfs in all_settings:
        metric = [RU.average_rank(df, sim_opts=eval_sim_opts) for df in dfs]
        ret[type + '_' + metric_name + '_mean'], \
            ret[type + '_' + metric_name + '_std'] = (np.mean(metric),
                                                      np.std(metric))

    metric_name = 'r_2'
    for type, dfs in all_settings:
        metric = [RU.int_r_2(df, sim_opts=eval_sim_opts) for df in dfs]
        ret[type + '_' + metric_name + '_mean'], \
            ret[type + '_' + metric_name + '_std'] = (np.mean(metric),
                                                      np.std(metric))

    if algo_feed:
        if only_rl:
            all_settings = [('RL', rl_events)]
        else:
            all_settings = [('RL', rl_events), ('RQ', RQ_events),
                            ('poisson', poisson_events),
                            ('karimi', karimi_events)]
            if algo_feed:
                all_settings += [('RQ_algo', RQ_algo_events)]

        for type, all_events in all_settings:
            r_2_algo = []
            r_algo = []
            top_k_algo = []
            for events in all_events:
                # Calculate some metrics here itself.
                times, r_2 = ES.algo_true_rank(
                    sink_ids=sink_ids,
                    src_id=eval_sim_opts.src_id,
                    events=events,
                    start_time=window_start,
                    end_time=eval_sim_opts.end_time,
                    steps=REWARD_STEPS,
                    all_prefs=algo_feed_args,
                    square=True,
                    c=algo_c)
                r_2_algo.append(np.sum(r_2) * (times[1] - times[0]))

                times, ranks = ES.algo_true_rank(
                    sink_ids=sink_ids,
                    src_id=eval_sim_opts.src_id,
                    events=events,
                    start_time=window_start,
                    end_time=eval_sim_opts.end_time,
                    steps=REWARD_STEPS,
                    all_prefs=algo_feed_args,
                    square=False,
                    c=algo_c)
                r_algo.append(np.sum(ranks) * (times[1] - times[0]))

                times, top_ks = ES.algo_top_k(
                    sink_ids=sink_ids,
                    src_id=eval_sim_opts.src_id,
                    events=events,
                    start_time=window_start,
                    end_time=eval_sim_opts.end_time,
                    K=K,
                    steps=REWARD_STEPS,
                    all_prefs=algo_feed_args,
                    c=algo_c)
                top_k_algo.append(np.sum(top_ks) * (times[1] - times[0]))

            ret[type + '_r_2_algo_mean'] = np.mean(r_2_algo)
            ret[type + '_r_2_algo_std'] = np.std(r_2_algo)
            ret[type + '_avg_rank_algo_mean'] = np.mean(r_algo)
            ret[type + '_avg_rank_algo_std'] = np.std(r_algo)
            ret[type + '_top_k_algo_mean'] = np.mean(top_k_algo)
            ret[type + '_top_k_algo_std'] = np.std(top_k_algo)

    return ret
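# Note on the checkpoint selection in worker_user above (made-up file names):
# realsorted parses the '-<num>' in 'tpprl.ckpt-<num>.meta' as a negative
# number, so the highest-numbered checkpoint sorts first.
import natsort

ckpts = ['tpprl.ckpt-9.meta', 'tpprl.ckpt-898.meta', 'tpprl.ckpt-89.meta']
print(natsort.realsorted(ckpts)[0])
# tpprl.ckpt-898.meta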
save_dir = data_dir + 'training_result/compare_result/'
filename = data_dir + 'info/Info.npy'
info = np.load(filename, allow_pickle=True)
sids = info[:, 0]

savefile = data_dir + 'tfAnalysis/ERSD_activation.npy'
ersd_corr = np.load(savefile, allow_pickle=True).item()
# ersd_corr: dict with key=sid. ersd['sid'].shape=(2, channel number)

training_result_dir = data_dir + 'training_result/'
model_name = ['eegnet', 'shallowFBCSPnet', 'deepnet', 'deepnet_da', 'resnet']
decoding_accuracy = []
results_path = realsorted([
    str(pth) for pth in Path(training_result_dir + 'deepLearning/').iterdir()
    if 'DS_Store' not in str(pth) and 'pdf' not in str(pth)
])  # if pth.suffix == '.npy'
for i, modeli in enumerate(model_name):
    decoding_accuracy.append([])
    for path in results_path:
        path = str(path)
        result_file = path + '/training_result_' + modeli + '.npy'
        result = np.load(result_file, allow_pickle=True).item()
        train_losses = result['train_losses']
        train_accs = result['train_accs']
        val_accs = result['val_accs']
        test_acc = result['test_acc']
        # BUG
        if modeli == 'deepnet_da':