def findAnagrams(self, s, p):
    """
    :type s: str
    :type p: str
    :rtype: List[int]
    looking for signature.
    """
    if not s:
        return []
    from collections import Counter as cc
    result = []
    p_len = len(p)
    pc = cc(p)
    tmp_cc = cc()
    for i in range(len(s)):
        # (s[i: i + p_len])
        tmp_cc.update([s[i]])
        if i >= p_len - 1:
            if pc == tmp_cc:
                result += (i - p_len + 1),
            tmp_cc.subtract([s[i - p_len + 1]])
    return result
def check_pts(self):
    pts = [x[1] for x in sorted([a for b in [[(kx, k) for kx in self.vals[k]]
                                             for k in self.vals.keys()] for a in b])]
    vals = [x[0] for x in sorted([a for b in [[(kx, k) for kx in self.vals[k]]
                                              for k in self.vals.keys()] for a in b])]
    aS = float(len(pts))
    if vals[len(pts) - self.size[self.top]] == 0:
        pres = len([v for v in vals if v > 0])
        tS, bS, aS = pres, 1 - pres, float(len(pts))
    else:
        tS, bS, aS = self.size[self.top], self.size[self.bottom], float(len(pts))
    tPts = pts[len(pts) - tS::]
    bPts = pts[0:bS]
    tEH, tEL = int((tS / aS) * tS), int((tS / aS) * bS)
    bEH, bEL = int((bS / aS) * tS), int((bS / aS) * bS)
    tHO, tLO = cc(tPts)[self.top], cc(bPts)[self.top]
    bHO, bLO = cc(bPts)[self.bottom], cc(bPts)[self.bottom]
    hi_chi = chisquare([tHO, tS - tHO], f_exp=[tEH, tS - tEH])[1]
    lo_chi = chisquare([bLO, bS - bLO], f_exp=[bEL, bS - bEL])[1]
    return tHO - tEH, bLO - bEL, hi_chi, lo_chi
def bin_chi(self, vals, r, maxSize):
    both = sorted(cc(vals + r).items())
    bins, span, sT = [], [0], 0
    for v, c in both:
        span.append(v)
        sT += c
        if sT >= maxSize:
            bins.append((span[0], span[-1]))
            span, sT = [v + 1], 0
    if span[0] != span[-1]:
        bins.append((span[0], span[-1]))
    n = 0
    rC, rK, rCnts = sorted(cc(r).items()), 0, [1 for b in bins]
    vC, vK, vCnts = sorted(cc(vals).items()), 0, [0 for b in bins]
    while n < len(bins):
        while rK < len(rC) and rC[rK][0] < bins[n][0]:
            rK += 1
        while rK < len(rC) and rC[rK][0] <= bins[n][1]:
            rCnts[n] += rC[rK][1]
            rK += 1
        while vK < len(vC) and vC[vK][0] < bins[n][0]:
            vK += 1
        while vK < len(vC) and vC[vK][0] <= bins[n][1]:
            vCnts[n] += vC[vK][1]
            vK += 1
        n += 1
    chiT, chiP = stats.chisquare(vCnts, f_exp=rCnts)
    return round(chiT, 4), chiP
def create_groups(self, test_num=19):
    self.group_cutoff = 0.1
    ### FIRST YOU GOTTA DO A SAMPLE PRUNING ###
    ### ---- ###
    ### THEN YOU CAN DO A FEATURE PRUNING ###
    self.km = [KMeans(n_clusters=p) for p in range(1, test_num)]
    self.run = [self.km[p].fit(self.vals) for p in range(1, len(self.km))]
    k_centers, k_labels = [k.cluster_centers_ for k in self.run], [k.labels_ for k in self.run]
    k_inertia = [k.inertia_ for k in self.run]
    for i, centers, labels in zip(range(len(self.run)), k_centers, k_labels):
        center_key, val_key, sample_to_center, center_loc, center_samples = (
            dd(list), dd(lambda: dd(list)), dd(lambda: {}), {}, dd(list))
        cLen = i + 2.0
        if max(sorted(cc(labels).values(), reverse=True)[1::]) / (len(labels) / cLen) < 0.05:
            continue
        # print max(sorted(cc(labels).values(),reverse=True)[1::])/(len(labels)/cLen),i+2
        # print len(labels)/cLen,len(labels),cc(labels).values()
        for k, c in enumerate(centers):
            center_loc[k] = c
        for k, c in enumerate(centers):
            center_key[k] = sorted([(dist(centers[m], c), m) for m in range(len(centers)) if m != k])
        for j in range(len(self.vals)):
            center_samples[labels[j]].append(j)
            sample_to_center[j] = {k: dist(self.vals[j], c) for k, c in enumerate(centers)}
        for c, S in center_samples.items():
            s_dists = [(dist(self.vals[j], center_loc[c]), j) for j in S]
            # print sorted([x[0] for x in s_dists],reverse=True)[0:5]
            # print i,len(centers),centers
            # print 'to',len(s_dists)
        for c, S in center_samples.items():
            if self.SAMPLES:
                sIDs = [(self.key[self.samples[s]], self.samples[s]) for s in S]
                sTypes = sorted([(x[1], x[0]) for x in cc([s[0] for s in sIDs]).items()], reverse=True)
                print i + 2.0, c, len(S), sTypes[0][1], sTypes[0][0] / float(len(S))
            else:
                sIDs = [(self.data.feats[s], s) for s in S]
                print i + 2.0, c, len(S), sIDs
    # ideas: do condensation-based distances and set parameters for the grouping; we need a
    # min size rate for distance clusters, min support (number close), no outliers, and consistency
    sys.exit()
def summarize(self, parent, Y):
    self.summaries = {}
    vals, key, lens, opts, fracs = (self.vals[parent], self.key[parent], self.lens[parent],
                                    self.opts[parent], self.fracs[parent])
    yKey = {k: [Y[i] for i in key[k]] for k in key}
    yMeans = {k: np.mean(v) for k, v in yKey.items()}
    yObs = {k: len([vi for vi in v if vi > 0]) / float(len(v)) for k, v in yKey.items()}
    yObsMean = sorted([(np.mean(v), len([vi for vi in v if vi > 0]) / float(len(v)), k)
                       for k, v in yKey.items()])
    yMix = sorted([a for b in [[(v, k) for v in yKey[k]] for k in yKey.keys()] for a in b])
    yZero = [ym for ym in yMix if ym[0] == 0]
    yNZ = [ym for ym in yMix if ym[0] > 0]
    yRanks = yZero + [(i + 1, yNZ[i][1]) for i in range(len(yNZ))]
    yChis = {p: [yRanks[i][0] for i in range(len(yRanks)) if yRanks[i][1] == p] for p in yKey.keys()}
    chiSort = sorted([(np.mean(yChis[p]), p) for p in yChis.keys()])
    low_len, hi_len = max(lens[chiSort[0][1]], len(yZero)), lens[chiSort[-1][1]]
    lowExp, hiExp = ([fracs[k] * low_len for k in yChis.keys()],
                     [fracs[k] * hi_len for k in yChis.keys()])
    lowCC = cc([yRanks[i][1] for i in range(0, low_len)])
    hiCC = cc([yRanks[i][1] for i in range(len(yRanks) - hi_len, len(yRanks))])
    lowObs, hiObs = [lowCC[k] for k in yChis.keys()], [hiCC[k] for k in yChis.keys()]
    lowChi, hiChi = chisquare(lowObs, f_exp=lowExp)[1], chisquare(hiObs, f_exp=hiExp)[1]
    if self.type[parent] == 'binary':
        VS = VariableSummarize('binary').add_data(yObsMean, lowChi, hiChi)
        #VS = VariableSummary('binary')
        #.add_data(yObsMean,chiSort,(lowChi,hiChi))
        #VS.add_data(sorted(yMeans.items(),key=lambda X:X[1]),sorted(yObs.items(),key=lambda X: X[1]),chiSort,(lowChi,hiChi))
    else:
        VS = VariableSummarize('continuous').add_data(yObsMean, lowChi, hiChi)
        #VS = VariableSummary('continuous')
        #VS.add_data(sorted(yMeans.items(),key=lambda X:X[1]),sorted(yObs.items(),key=lambda X: X[1]),chiSort,(lowChi,hiChi))
    return VS
def test_default_main():
    from collections import Counter as cc
    sio = stdoutcapture(["-n", tdir + "diff1.xlsx", tdir + "diff2.xlsx"])
    assert (cc(x.replace('"', '').split("\t")[1] for x in sio) ==
            cc({"replace": 3, "insert": 2, "delete": 1}))
def binary_labels(self, attribute, label='color', rank=None):
    sAll, sValid = ([s.attributes[attribute] for s in self._list],
                    [s.attributes[attribute] for s in self._list if s.attributes[attribute] != 'NA'])
    if len(set(sValid)) > 0:
        try:
            minSize = min(self.min_group_size, sorted(cc(sValid).values())[-2])
        except IndexError:
            minSize = 10
        sGrps, sIdx, sMissing = list(set(sValid)), [], []
        if len(sGrps) > 2:
            sGrps = sorted([a for (a, b) in cc(sValid).items() if b > minSize], reverse=True)
            if len(sGrps) > self.max_group_members:
                sGrps = sGrps[0:self.max_group_members]
        for s in self._list:
            if s.attributes[attribute] in sGrps:
                sIdx.append(sGrps.index(s.attributes[attribute]))
            else:
                sIdx.append(len(sGrps))
                sMissing.append(s.attributes[attribute])
        if label == 'color':
            label_list, missing_label = get_color_list(len(sGrps), len(sMissing) != 0, rank,
                                                       OFFSET=self.color_offset)
            self.color_offset += len(label_list)
        elif label == 'marker':
            label_list, missing_label = get_marker_list(len(sGrps), len(sMissing) != 0, rank)
        if len(sMissing) > 0:
            sGrps.append(attribute + '=' + ",".join([sM.split('~')[-1] for sM in list(set(sMissing))]))
            label_list.append(missing_label)
        label_vals, label_labels = [label_list[i] for i in sIdx], [sGrps[i] for i in sIdx]
        return label_vals, label_labels
def partitionLabels(self, S):
    """
    :type S: str
    :rtype: List[int]
    Input: S = "ababcbaca defegde hijhklij"
    """
    from collections import Counter as cc
    from __builtin__ import xrange
    current_set = set()
    cs = cc(S)
    result = []
    last_idx = 0
    for i in xrange(len(S)):
        current_set.add(S[i])
        cs[S[i]] -= 1
        if cs[S[i]] == 0:
            current_set.remove(S[i])
        if not current_set:
            result.append(i - last_idx + 1)
            last_idx = i + 1
    return result
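# Hedged usage sketch for partitionLabels, assuming the method above lives on a
# LeetCode-style `Solution` class (the class itself is not part of this snippet).
# The spaces in the docstring example only mark where the partitions fall; each
# returned length covers every occurrence of the letters inside that partition.
print(Solution().partitionLabels("ababcbacadefegdehijhklij"))  # expected: [9, 7, 8]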
def draw_multi_group(self, multi_group):
    names, locs = [mg[0] for mg in multi_group], [mg[-1] for mg in multi_group]
    gc = cc([a for b in [self.group_key[n] for n in names] for a in b])
    centroid = np.mean([mg[-1][0] for mg in multi_group]), np.mean([mg[-1][1] for mg in multi_group])
    cluster_pts = self.make_gene_circle(5.0, len(names), origin=centroid)
    shared_genes = [g for g in gc if gc[g] > 1]
    for pt in cluster_pts:
        group = names[sorted([(np.linalg.norm(np.array(pt) - np.array(multi_group[i][-1])), i)
                              for i in range(len(multi_group))
                              if multi_group[i][0] not in self.group_locs])[0][1]]
        self.draw_group(pt, group)
        uG = [g for g in self.group_key[group] if self.obs[g] == 1]
        fG = [g for g in self.group_key[group] if self.obs[g] > 1 and gc[g] == 1]
        if len(uG) > 1:
            uniq_pts = self.make_gene_circle(10, len(uG), origin=pt,
                                             start_value=math.atan(pt[1] / pt[0]) - (math.pi / 4))
            for p, g in zip(uniq_pts, uG):
                self.draw_gene(p, g, CENTER=pt)
        if len(fG) > 1:
            far_pts = self.make_gene_circle(30, len(fG), origin=pt,
                                            start_value=math.atan(pt[1] / pt[0]) - (math.pi / 6))
            for p, g in zip(far_pts, fG):
                self.draw_gene(p, g, CENTER=pt)
    if self.DUPES:
        cluster_uniq, cluster_multi = ([g for g in shared_genes if self.obs[g] == gc[g]],
                                       [g for g in shared_genes if self.obs[g] > gc[g]])
    else:
        cluster_uniq, cluster_multi = ([g for g in shared_genes
                                        if g not in self.primary_locations and self.obs[g] == gc[g]],
                                       [g for g in shared_genes
                                        if g not in self.primary_locations and self.obs[g] > gc[g]])
    if len(cluster_uniq) > 0:
        uniq_pts = self.make_gene_circle(20, len(cluster_uniq), origin=centroid,
                                         start_value=math.atan(centroid[0] / centroid[1]) - (math.pi / 2))
        for p, g in zip(uniq_pts, cluster_uniq):
            self.draw_gene(p, g, CENTER=centroid)
    if len(cluster_multi) > 0:
        far_pts = self.make_gene_circle(40, len(cluster_multi), origin=centroid,
                                        start_value=math.atan(centroid[0] / centroid[1]) - (math.pi / 5))
        for p, g in zip(far_pts, cluster_multi):
            self.draw_gene(p, g, CENTER=centroid)
    return
def score_ids(self, id_list, gene, my_cv):
    TOPNAME = False
    for i, ID in enumerate(self.IDS):
        vt = cc([x[i] for x in id_list])
        scores = sorted([(cv / float(self.exp_ids[ID][cx]), cx) for cx, cv in vt.items() if cv > 10],
                        reverse=True)
        if len(scores) == 0:
            self.scr_key[gene].append((ID, 'NA', 1, 1))
            continue
        else:
            topScr, topName, nextScr, nextName = scores[0][0], scores[0][1], 0.1, 'None'
            if len(scores) > 1:
                nextScr, nextName = scores[1][0], scores[1][1]
            self.wRes.write('%s %s %s %s %s | %s %s %s\n' %
                            (gene, ID, topName, topScr, topScr / nextScr, nextName, nextScr, my_cv))
            self.scr_key[gene].append((ID, topName, round(topScr, 3), round(topScr / nextScr, 3)))
            if ID == 'CTX':
                if topScr > 1.5:
                    TOPNAME = topName
    return TOPNAME
def add_hist(self, h_data):
    x, y = h_data.keys(), sorted(h_data.values())
    if len(self.ax.hist(y, bins='auto')[0]) < 2:
        self.ax.clear()
        self.ax.hist(np.hstack(y), bins=min(4, len(cc(y))))
    yMin, yLim = self.ax.get_ylim()
    xMin, xLim = self.ax.get_xlim()
    HI, LO = False, False
    if yMin == 0:
        h_srt = sorted(y)
        out_bool, out_scrs, out_colors = mad_based_outlier(np.array(h_srt))
        g_color = out_colors[sorted([(out_scrs[oi], oi) for oi in range(len(out_scrs))])[0][1]]
        for i, h in enumerate(h_srt):
            if not out_bool[i]:
                self.ax.scatter(h, yLim * 1.025, alpha=0.7, color=g_color, clip_on=False)
            else:
                self.ax.scatter(h, yLim * 1.025, alpha=0.7, color=out_colors[i], clip_on=False)
    self.ax.set_ylim(yMin, yLim)
    self.ax.set_xlim(xMin, xLim)
    return self
def evaluate_markers(self, outstr):
    out = open(outstr, 'w')
    # out = sys.stdout
    out.write('%-30s %25s %10s | %13s %13s %13s | %10s\n' %
              ('---', 'INTEREST', 'OBS', 'obsN', 'totN', 'enrich', 'stat-data'))
    for (feature, counts) in zip(self.dex.features, self.dex.counts):
        for interest in self.results.keys():
            if self.dex.types[interest] != 'binary':
                continue
            #for k in self.results[interest][feature]:
            #for k in range(1):
            scores, cnts = self.results[interest][feature]['ttest']
            if min([s[1] for s in scores]) > 0.01:
                continue
            if max([s[2] for s in scores]) < 2.0:
                continue
            my_cnts = []
            for c in cnts.keys():
                my_cnts.extend([(s, c) for s in cnts[c][-1]])
            my_cnts.sort(reverse=True)
            my_cc, my_len = cc([m[1] for m in my_cnts]), len(my_cnts)
            total = sum(my_cc.values())
            tots, exp, obs, n, enrich = ({x: y for x, y in my_cc.items()},
                                         {x: y / float(total) for x, y in my_cc.items()},
                                         dd(float), 0.0, dd(list))
            for i, (mS, m) in enumerate(my_cnts):
                if mS == 0:
                    break
                n += 1.0
                obs[m] += 1.0
                if i > 10 and i % 3 == 0:
                    for a, aX in obs.items():
                        ech = (aX / n) / exp[a]
                        if ech > 1.25:
                            if len(enrich[a]) < 3 and ech > 1.25:
                                enrich[a].append([ech, aX, n])
                            elif len(enrich[a]) >= 3 and ech > enrich[a][-1][0]:
                                enrich[a].append([ech, aX, n])
            if len(enrich.keys()) == 0:
                continue
            for (e, eX) in enrich.items():
                if obs[e] > tots[e] / 3.0:
                    my_scrs = [s for s in scores if e in s[0]]
                    out.write('%-30s %25s %10.3f |' % (feature, e, obs[e] / float(tots[e])))
                    out.write(' %13s %13s %13.3f |' % (eX[-1][1], eX[-1][2], eX[-1][0]))
                    #print feature,e,round(obs[e] / float(tots[e]),3)
                    #print '|',eX[-1][1],eX[-1][2],round(eX[-1][0],3),
                    for (a, b), pv, fc in my_scrs:
                        out.write(' %10s %5.2e %5.2f' % (a + ',' + b, pv, fc))
                    out.write('\n')
def check_halfs(self):
    keys = self.vals.keys()
    vals = [x[0] for x in sorted([a for b in [[(kx, k) for kx in self.vals[k]]
                                              for k in self.vals.keys()] for a in b])]
    pres = len([v for v in vals if v > 0]) / float(len(vals))
    aS = float(len(vals))
    if pres < 0.04:
        return 1.0, 1.0
    if pres > 0.4:
        pres = 0.4
    pnot = 1 - pres
    ec = [int((self.size[k] / aS) * (aS * pres)) for k in keys]
    enc = [int((self.size[k] / aS) * (aS * pnot)) for k in keys]
    pts = [x[1] for x in sorted([a for b in [[(kx, k) for kx in self.vals[k]]
                                             for k in self.vals.keys()] for a in b],
                                reverse=True)][0:int(aS * pres)]
    ptsN = [x[1] for x in sorted([a for b in [[(kx, k) for kx in self.vals[k]]
                                              for k in self.vals.keys()] for a in b])][0:int(aS * pnot)]
    pc = cc(pts)
    obs = [pc[k] for k in keys]
    pn = cc(ptsN)
    obsN = [pn[k] for k in keys]
    P1 = chisquare(obs, f_exp=ec)[1]
    P2 = chisquare(obsN, f_exp=enc)[1]
    return P1, P2
def check_neighbors(self, s_order, ID):
    FULL_RANGE, HALF_RANGE, CHECK_RANGE = (self.sample_range, self.sample_range / 2,
                                           (self.sample_range / 2) - 1)
    for i in range(len(s_order)):
        if i < HALF_RANGE:
            left = 0
            right = i + (FULL_RANGE - i) + 1
        elif i + HALF_RANGE > len(s_order):
            left = i - (FULL_RANGE - (len(s_order) - i)) - 1
            right = len(s_order)
        else:
            left = i - HALF_RANGE
            right = 1 + i + HALF_RANGE
        prev = [p[0] for p in s_order[left:i]]
        post = [p[0] for p in s_order[i + 1:right]]
        preCC, postCC, bothCC = cc(prev), cc(post), cc(prev + post)
        preTups = sorted([(x, float(y) / sum(preCC.values())) for x, y in preCC.items()],
                         reverse=True, key=lambda X: X[1])
        postTups = sorted([(x, float(y) / sum(postCC.values())) for x, y in postCC.items()],
                          reverse=True, key=lambda X: X[1])
        bothTups = sorted([(x, float(y) / sum(bothCC.values())) for x, y in bothCC.items()],
                          reverse=True, key=lambda X: X[1])
        topBoth, nextBoth = bothTups[0], ('NA', 0.01)
        if len(bothTups) > 1:
            nextBoth = bothTups[1]
        self.wNay.write('%s %s %s %s ' % (s_order[i][1], i, ID, s_order[i][0]))
        self.wNay.write('%s %4.4f %3.2f %s %4.4f | ' %
                        (topBoth[0], topBoth[1], topBoth[1] / nextBoth[1], nextBoth[0], nextBoth[1]))
        topPrev, topPost = ('NA', 0.01), ('NA', 0.01)
        if len(prev) > CHECK_RANGE:
            topPrev = preTups[0]
        if len(post) > CHECK_RANGE:
            topPost = postTups[0]
        allTops = sorted([topPrev, topPost, topBoth], reverse=True, key=lambda X: X[1])
        self.wNay.write('%s %4.4f \n' % (allTops[0][0], allTops[0][1]))
def simsample_items(L, size=200):
    L_new, L_now = [], [int(round(10.0 * v, 0)) for v in L]
    shuffle(L_now)
    while len(L_now) + len(L_new) < size:
        L_new.extend(L_now)
    L_new.extend(random.sample(L_now, size - len(L_new)))
    Lc = cc(L_new)
    return [Lc[a] if a in Lc else 0 for a in range(0, 11)]
def rewrite(self, words, k):
    """
    :type words: List[str]
    :type k: int
    :rtype: List[str]
    """
    from collections import Counter as cc
    result = cc(words)
    # assuming this is the top-k-frequent-words problem: return the k most
    # common words, breaking frequency ties alphabetically
    return sorted(result, key=lambda w: (-result[w], w))[:k]
def rewrite(self, graph, initial):
    """
    :type graph: List[List[int]]
    :type initial: List[int]
    :rtype: int
    no rank compression
    """
    from collections import Counter as cc
    nodes = len(graph)
    parents = range(nodes)  # [0, 1, 2, 3, ... ] SMART!

    def find(node):
        if parents[node] != node:
            return find(parents[node])
        return node

    def union(x, y):
        fx = find(x)
        fy = find(y)
        parents[fx] = fy

    # build union parents
    for x in range(nodes):
        for y in range(x + 1, nodes):
            # note: only a 1 means there is a connection!
            if graph[x][y] == 1:
                union(x, y)
    # print("parents: {}".format(parents))

    # dissect which nodes form a quorum
    # allNodes means: every node in the same quorum eventually returns the same
    # root node, so that root's count is the number of nodes inside the quorum.
    allNodes = cc(find(i) for i in range(nodes))
    # print("allNodes: {}".format(allNodes))

    # initial means the nodes being affected.
    # with these affected nodes, check whether they are in the same quorum.
    # If they are, a single key carries multiple counts.
    # If not, e.g. with 2 affected nodes, there are 2 counter keys, each
    # with value 1.
    # badNodes = cc(find(i) for i in initial)
    # print("badNodes: {}".format(badNodes))
    result = []
    # key point: use the union root as the key, and count by that key!
    for bad in initial:
        key = find(bad)
        result.append((allNodes[key], -bad))
    # print(result)
    return -max(result)[1]
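# Hedged usage sketch, assuming the union-find method above sits on a
# LeetCode-style `Solution` class (not shown here). Nodes 0 and 1 share a
# component of size 2, so among the affected nodes in the largest component
# the smallest index (0) is returned.
print(Solution().rewrite([[1, 1, 0], [1, 1, 0], [0, 0, 1]], [0, 1]))  # expected: 0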
def numJewelsInStones(self, J, S):
    """
    :type J: str
    :type S: str
    :rtype: int
    """
    from collections import Counter as cc
    cj = cc(J)
    cs = cc(S)
    cnt = 0
    for k in cs.keys():
        if k in cj:
            cnt += cs[k]
    return cnt
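# Hedged usage sketch, assuming a LeetCode-style `Solution` wrapper class:
# the jewels "aA" appear 3 times among the stones "aAAbbbb".
print(Solution().numJewelsInStones("aA", "aAAbbbb"))  # expected: 3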
def add_hist(self, h_data):
    if type(h_data) == list:
        x, y = range(len(h_data)), sorted(h_data)
    else:
        x, y = h_data.keys(), sorted(h_data.values())
    if len(self.ax.hist(y, bins='auto')[0]) < 2:
        self.ax.clear()
        self.ax.hist(np.hstack(y), bins=min(4, len(cc(y))))
    yMin, yLim = self.ax.get_ylim()
    xMin, xLim = self.ax.get_xlim()
    HI, LO = False, False
    if yMin == 0:
        h_srt = sorted(y)
        out_bool, out_scrs, out_colors = mad_based_outlier(np.array(h_srt))
        g_color = out_colors[sorted([(out_scrs[oi], oi) for oi in range(len(out_scrs))])[0][1]]
        if len(h_srt) < 100:
            s = 10
        elif len(h_srt) < 1000:
            s = 5
        else:
            s = 3
        for i, h in enumerate(h_srt):
            if not out_bool[i]:
                self.ax.scatter(h, yLim * 1.025, alpha=0.7, color=g_color, s=s, clip_on=False)
            else:
                self.ax.scatter(h, yLim * 1.025, alpha=0.7, color=out_colors[i], s=s, clip_on=False)
            if i > 5 and (out_scrs[i] / (out_scrs[i - 1] + out_scrs[i - 2])) > 0.66:
                HI = True
            if HI:
                try:
                    self.ax.text(h, yLim * 1.030, x[i].name.split(";")[-1])
                except AttributeError:
                    continue
    self.ax.set_ylim(yMin, yLim)
    self.ax.set_xlim(xMin, xLim)
    return self
def calculate_chi_enrichment(self, pc_ids, pc_ids2=[]):
    if len(pc_ids2) == 0:
        cLen = len(pc_ids)
        c_cc = cc(pc_ids)
        c_exp = [cLen * self.prc_rates[k] for k in self.prc_rates]
        c_obs = [c_cc[k] if k in c_cc else 0 for k in self.prc_rates]
        chi_over = sorted([(co - ce, k) for co, ce, k in zip(c_obs, c_exp, self.prc_rates)])[-1][1]
        chi_pv = chisquare(c_obs, f_exp=c_exp)[1]
        return cLen, c_obs, chi_pv, chi_over
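# A minimal standalone sketch (hypothetical counts) of the scipy.stats.chisquare
# pattern used throughout these methods: observed category counts are compared
# against expected counts, and the [1] element of the result is the p-value.
from scipy.stats import chisquare

obs = [18, 12, 10]        # observed counts per category (total 40)
exp = [16.0, 12.0, 12.0]  # expected counts under the null rates (also total 40)
p_value = chisquare(obs, f_exp=exp)[1]
print(p_value)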
def rewrite(self, s):
    """
    :type s: str
    :rtype: str
    """
    from collections import Counter as cc
    cs = cc(s)
    result = ""
    for str_char, times in cs.most_common():
        result += str_char * times
    return result
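# Hedged usage sketch, assuming a LeetCode-style `Solution` class: characters are
# emitted in descending frequency order, so "tree" becomes "eert" or "eetr"
# (equally frequent characters can come out in either order).
print(Solution().rewrite("tree"))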
def rewrite(self, strs):
    """
    :type strs: List[str]
    :rtype: List[List[str]]
    """
    from collections import Counter as cc
    result = dict()
    for w in strs:
        c = tuple(sorted(cc(w).items()))
        if c not in result:
            result[c] = list()
        result[c].append(w)
    return result.values()
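# Hedged usage sketch, assuming a LeetCode-style `Solution` class: words sharing
# the same letter counts land in the same group (group order is not guaranteed).
print(Solution().rewrite(["eat", "tea", "tan", "ate", "nat", "bat"]))
# e.g. [['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]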
def collate_continuous(self):
    self.cont_res = dd(list)
    self.binary_res = dd(dict)
    n_keys = ['SPIKES'] + [k for k in self.data.keys() if len(k.split('-')) == 2]
    x_keys = [k for k in self.data.keys() if len(k.split('-')) != 2]
    self.MINCC = 1
    self.ST1, self.ST2, self.ST3 = 15, 5, 1
    for s in self.samples:
        s_amps, s_len = self.data['POS_AMPS'][s], len(self.data['POS_AMPS'][s])
        ss = EFIZ_STAT(s, s_amps, s_len)
        for n in n_keys:
            #if n[0] == 'R': continue
            ss.set_stat(n, self.data[n][s])
        for k, v in ss.key.items():
            self.cont_res[k].append(v)
        for k, v in ss.categorical.items():
            self.binary_res[k][s] = v
        spikes, response, r_key = [0] + self.data['SPIKES'][s], self.data['RESPONSE'][s], dd(int)
        hLen, xLen = (len([sp for sp in spikes if sp > self.ST2]),
                      len([sp for sp in spikes if sp > self.ST3]))
        for x, y in cc(response).items():
            r_key[x] += y
        if r_key['C'] >= self.MINCC and max(spikes) >= self.ST1 and hLen > 1 and xLen > 2:
            FS = 'SUSTAINED'
        elif max(spikes) > 3:
            FS = 'ACTIVE'
        elif max(spikes) > 0:
            FS = 'RESPONSIVE'
        else:
            FS = 'ABORTIVE'
        self.binary_res['FSTYLE'][s] = FS
def mostCommonWord(self, paragraph, banned):
    """
    :type paragraph: str
    :type banned: List[str]
    :rtype: str
    """
    from collections import Counter as cc
    from __builtin__ import unicode
    import re
    bp = map(unicode.lower, banned)
    cp = re.sub(r'[^a-zA-Z]', ' ', paragraph).lower().split()
    cp_cnt = cc([w for w in cp if w not in bp])
    result = cp_cnt.most_common()[0][0]
    return result
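# Hedged usage sketch (Python 2, matching the __builtin__ import above), assuming a
# LeetCode-style `Solution` class; banned words are passed as unicode so that
# unicode.lower applies cleanly.
print(Solution().mostCommonWord(
    u"Bob hit a ball, the hit BALL flew far after it was hit.", [u"hit"]))  # expected: ball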
def test_ks(self):
    self.result = {}
    self.standard = 4
    self.run = [self.km[p].fit(self.vals) for p in range(1, len(self.km))]
    k_centers, k_labels = [k.cluster_centers_ for k in self.run], [k.labels_ for k in self.run]
    self.samples = len(k_labels[0])
    for i, centers, labels in zip(range(len(self.run)), k_centers, k_labels):
        center_key, val_key = dd(list), dd(lambda: dd(list))
        for k, c in enumerate(centers):
            center_key[k] = sorted([(dist(centers[m], c), m) for m in range(len(centers)) if m != k])
            for j in range(len(self.vals)):
                val_key[labels[j]][k].append(dist(self.vals[j], c))
        if i < 0:
            continue
        else:
            center_stats = {}
            for myC, altCs in sorted(center_key.items()):
                mDists, nAvgs, nScrs = val_key[myC][myC], [], []
                mArray = [[mD, []] for mD in mDists]
                mAvg = np.mean(sorted(mDists)[0:int(1.0 + len(mDists) * self.tm)])
                for (jScr, jNum) in altCs:
                    jDists = val_key[myC][jNum]
                    for n in range(len(jDists)):
                        mArray[n][-1].append(jDists[n])
                    jScrs = sorted([pJ / (pJ + pM) for pM, pJ in zip(mDists, jDists)],
                                   reverse=True)[0:int(1.0 + len(mDists) * self.tm)]
                    jAvg = np.mean(sorted(jDists[0:int(len(mDists) * self.tm)]))
                    if len(jScrs) < 1:
                        nScrs.append(0.0)
                    else:
                        nScrs.append(sum(jScrs) / float(len(jScrs)))
                    nAvgs.append(jAvg)
                center_stats[myC] = [mAvg, [nAvgs, nScrs], mArray]
            center_sizes = cc(labels).values()
            c_min, c_avg, c_max, n_clusts = [], [], [], []
            for cent in sorted(center_stats):
                mAvg, [cAvgs, cScrs], cArray = center_stats[cent]
                c_dists = [round(cAvg / (mAvg + cAvg), 3) for cAvg in cAvgs]
                c_min.append(min(c_dists))
                c_avg.append(round(np.mean(c_dists), 3))
                c_max.append(max(c_dists))
                n_rates = sorted([min([d / (m + 0.0000000001) for d in D]) for m, D in cArray])
                if len(n_rates) == 0:
                    n_scr = 0.0
                else:
                    n_scr = len([n for n in n_rates if n > self.standard]) / float(len(n_rates))
                n_clusts.append([len(n_rates), round(n_scr, 3)])
            self.result[i + 2] = [round(sum(c_avg) / len(c_avg), 3),
                                  round(sum(c_min) / len(c_min), 3),
                                  round(sum(c_max) / len(c_max), 3), n_clusts]
def __init__(self, sample_attributes, variable_options, variable_key, predictors, covariates):
    sample_num = len(sample_attributes.values()[0])
    self.group_sizes = {'intercept': sample_num}
    self.BIN = dd(bool)
    for v in variable_options:
        if len(variable_options[v][1]) == 0 or type(variable_options[v][1][0]) == str:
            self.BIN[v] = True
            for n, x in cc(sample_attributes[v]).items():
                self.group_sizes[n] = x
        else:
            self.group_sizes[v] = sample_num
    self.predictors, self.covariates, self.variables = predictors, covariates, predictors + covariates
    self.PREDICTOR, self.COVARIATE = dd(bool), dd(bool)
    self.PREDICTOR['intercept'], self.COVARIATE['intercept'] = True, True
    self.names = ['intercept']
    self.options, self.types, self.inferred = {'intercept': ['intercept']}, {}, dd(list)
    self.vals = {'intercept': [1.0 for s in sample_attributes.values()[0]]}
    self.key = {'intercept': {1.0: [1.0]}}
    self.sample_vals = [[1.0] for s in sample_attributes.values()[0]]
    for opt in variable_options:
        self.names.append(opt)
        self.options[opt], self.inferred[opt] = variable_options[opt][0], variable_options[opt][1]
        self.vals[opt] = sample_attributes[opt]
        for i, v in enumerate(self.vals[opt]):
            self.sample_vals[i].append(v)
        self.key[opt] = variable_key[opt]
        if opt in predictors:
            self.PREDICTOR[opt] = True
        else:
            self.COVARIATE[opt] = True
def rewrite(self, nums, k):
    """
    :type nums: List[int]
    :type k: int
    :rtype: int
    4
    1, 2, 2, 5
    must handle negative numbers!
    """
    from collections import Counter as cc
    dmap = cc()
    dmap[0] = 1
    summ = 0
    cnt = 0
    for n in nums:
        summ += n
        cnt += dmap[summ - k]
        dmap[summ] += 1
    return cnt
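# Hedged usage sketch, assuming a LeetCode-style `Solution` class: the prefix-sum
# counter handles negative values, so [1, -1, 1] contains three subarrays summing
# to 1 ([1], [1, -1, 1], and the trailing [1]).
print(Solution().rewrite([1, -1, 1], 1))  # expected: 3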
def prepare_variables(self, variables):
    #sample_idxs = self.prepare_idxs(variables)
    values, labels, idxs = [], [], [i for i in range(len(self.samples))
                                    if 'NA' not in [self.variables[v][i] for v in variables]]
    for v in variables:
        if self.types[v] == 'binary':
            kVals, kCount = ([self.variables[v][i] for i in idxs],
                             cc([self.variables[v][i] for i in idxs]))
            kPass = [kn for (kn, kv) in kCount.items()
                     if kv >= min(self.minGroupSize, max(kCount.values()))]
            if len(kPass) == len(kCount.keys()):
                kPass = kPass[1::]
            kV, kL = ([[1 if v == opt else 0 for v in kVals] for opt in kPass],
                      [v + '=' + opt for opt in kPass])
        else:
            kV, kL = [[self.variables[v][i] for i in idxs]], [v]
        values.extend(kV)
        labels.extend(kL)
    return values, labels, idxs
def fit_binary_dists(self, CUTOFF=4):
    for f in self.input.features:
        vals, logV, dZ = ([int(x) for x in f.cnts.values()],
                          [log(v + 1.0) for v in f.cnts.values()],
                          [0 for i in range(self.input.samples.len - len(f.cnts.values()))])
        if len(vals) < CUTOFF:
            continue
        if len(vals) > 10:
            continue
        val_key = {'RAW-NZ': vals, 'RAW-WZ': vals + dZ, 'LOG-NZ': logV, 'LOG-WZ': logV + dZ}
        for val_type, vals in val_key.items():
            vLen, v10, vMean = len(vals), int(len(vals) * 0.10), np.mean(vals)
            if val_type.split('-')[0] == 'LOG':
                continue
            else:
                # list() so the Poisson reference values are concatenated with the
                # observed values for binning (as in bin_chi), not added elementwise
                r = list(stats.poisson.rvs(vMean, size=len(vals)))
                both = sorted(cc(vals + r).items())
                bins, span, sT = [], [], 0
                for v, c in both:
                    span.append(v)
                    sT += c
                    if sT > v10:
                        bins.append((span[0], span[-1]))
                        span, sT = [v + 1], 0
                print bins
                sys.exit()
def select_labels(self, ID, MINSIZE=5, MAXGROUPS=20):
    id_vals = self.vals[ID]
    id_opts = list(set(self.vals[ID]))
    id_cc = sorted(cc(self.vals[ID]).items(), key=lambda X: X[1], reverse=True)
    p_opts = [c[0] for i, c in enumerate(id_cc) if (i == 0 or (i < MAXGROUPS and c[1] > MINSIZE))]
    f_opts = [opt for opt in id_opts if opt not in p_opts]
    if len(f_opts) > 0:
        p_opts.append('UNAVAIL')
    v_locs = [p_opts.index(v) if v in p_opts else p_opts.index('UNAVAIL') for v in id_vals]
    return v_locs, p_opts