def findAnagrams(self, s, p):
    """
    :type s: str
    :type p: str
    :rtype: List[int]

    Sliding-window anagram search: keep a character-count window of
    width len(p) over s and record each start index where the window's
    counts equal p's counts.
    """
    from collections import Counter as cc

    p_len = len(p)
    # Nothing can match when s is empty or shorter than p.
    if not s or p_len > len(s):
        return []

    result = []
    pc = cc(p)
    window = cc()

    for i, ch in enumerate(s):
        window[ch] += 1

        if i >= p_len - 1:
            # Counter equality is dict equality on Python < 3.10, so a
            # stale zero-count entry (left behind by the shrink step
            # below) would make equal windows compare unequal.  Deleting
            # exhausted keys keeps the comparison correct everywhere.
            if window == pc:
                result.append(i - p_len + 1)

            left = s[i - p_len + 1]
            window[left] -= 1
            if window[left] == 0:
                del window[left]

    return result
Ejemplo n.º 2
0
    def check_pts(self):
        """Chi-square test of whether the top- and bottom-ranked points
        are enriched for the expected group labels.

        Reads self.vals ({group: [values]}), self.size ({group: size}),
        self.top and self.bottom; relies on module-level `cc`
        (collections.Counter) and scipy's `chisquare`.

        Returns (top observed - expected, bottom observed - expected,
        top chi-square p-value, bottom chi-square p-value).
        """

        # Flatten {group: [values]} into (value, group) pairs sorted by
        # value; pts is the group labels in ascending value order.
        pts = [
            x[1] for x in sorted([
                a for b in [[(kx, k) for kx in self.vals[k]]
                            for k in self.vals.keys()] for a in b
            ])
        ]

        # Same flattening, keeping the sorted values themselves.
        vals = [
            x[0] for x in sorted([
                a for b in [[(kx, k) for kx in self.vals[k]]
                            for k in self.vals.keys()] for a in b
            ])
        ]
        aS = float(len(pts))

        # If the value at the boundary of the top group is zero, fall
        # back to splitting at the number of positive values.
        if vals[len(pts) - self.size[self.top]] == 0:
            pres = len([v for v in vals if v > 0])
            # NOTE(review): bS = 1 - pres goes negative whenever more
            # than one value is positive — confirm this branch is
            # intentional.
            tS, bS, aS = pres, 1 - pres, float(len(pts))
        else:
            tS, bS, aS = self.size[self.top], self.size[self.bottom], float(
                len(pts))

        # Labels of the top tS points and the bottom bS points.
        tPts = pts[len(pts) - tS::]
        bPts = pts[0:bS]

        # Expected label counts under proportional allocation.
        tEH, tEL = int((tS / aS) * tS), int((tS / aS) * bS)
        bEH, bEL = int((bS / aS) * tS), int((bS / aS) * bS)
        tHO, tLO = cc(tPts)[self.top], cc(bPts)[self.top]
        # NOTE(review): bHO and bLO are both counted from cc(bPts); bHO
        # looks like it was meant to come from cc(tPts) — confirm.
        bHO, bLO = cc(bPts)[self.bottom], cc(bPts)[self.bottom]
        hi_chi = chisquare([tHO, tS - tHO], f_exp=[tEH, tS - tEH])[1]
        lo_chi = chisquare([bLO, bS - bLO], f_exp=[bEL, bS - bEL])[1]

        return tHO - tEH, bLO - bEL, hi_chi, lo_chi
Ejemplo n.º 3
0
    def bin_chi(self, vals, r, maxSize):
        """Bin the pooled values of `vals` and `r` into contiguous ranges
        holding at least `maxSize` observations, then chi-square test the
        per-bin counts of `vals` against the reference counts from `r`.

        Each reference bin count starts at 1 so the test never divides by
        a zero expectation.  Returns (statistic rounded to 4 dp, p-value).
        """
        pooled = sorted(cc(vals + r).items())

        # Greedily extend each bin until it holds >= maxSize items.
        edges, run, running = [], [0], 0
        for value, count in pooled:
            run.append(value)
            running += count
            if running >= maxSize:
                edges.append((run[0], run[-1]))
                run, running = [value + 1], 0
        if run[0] != run[-1]:
            edges.append((run[0], run[-1]))

        ref_items = sorted(cc(r).items())
        obs_items = sorted(cc(vals).items())
        expected = [1 for _ in edges]   # +1 prior per bin
        observed = [0 for _ in edges]

        # Both item lists are sorted, so a single pair of cursors sweeps
        # them once across all bins.
        ri = oi = 0
        for b, (lo, hi) in enumerate(edges):
            while ri < len(ref_items) and ref_items[ri][0] < lo:
                ri += 1
            while ri < len(ref_items) and ref_items[ri][0] <= hi:
                expected[b] += ref_items[ri][1]
                ri += 1

            while oi < len(obs_items) and obs_items[oi][0] < lo:
                oi += 1
            while oi < len(obs_items) and obs_items[oi][0] <= hi:
                observed[b] += obs_items[oi][1]
                oi += 1

        statistic, pvalue = stats.chisquare(observed, f_exp=expected)
        return round(statistic, 4), pvalue
Ejemplo n.º 4
0
    def create_groups(self, test_num=19):
        """Run k-means for k = 2 .. test_num and print candidate sample
        groupings for each clustering.

        Python 2 code (bare print statements).  Uses sklearn's KMeans on
        self.vals, then for each k inspects cluster sizes, inter-center
        distances and per-sample distances.  Always terminates the
        program via sys.exit() at the end — exploratory/diagnostic code.
        """

        self.group_cutoff = 0.1

        ### FIRST YOU GOTTA DO A SAMPLE PRUNING ### ---- ### THEN YOU CAN DO A FEATURE PRUNING ###

        # One KMeans model per candidate cluster count; note self.km[0]
        # (n_clusters=1) is allocated but skipped by the range(1, ...) fit.
        self.km = [KMeans(n_clusters=p) for p in range(1, test_num)]
        self.run = [self.km[p].fit(self.vals) for p in range(1, len(self.km))]
        k_centers, k_labels = [k.cluster_centers_ for k in self.run
                               ], [k.labels_ for k in self.run]
        k_inertia = [k.inertia_ for k in self.run]

        for i, centers, labels in zip(range(len(self.run)), k_centers,
                                      k_labels):
            center_key, val_key, sample_to_center, center_loc, center_samples = dd(
                list), dd(lambda: dd(list)), dd(lambda: {}), {}, dd(list)
            # i indexes self.run, so the actual cluster count is i + 2.
            cLen = i + 2.0

            # Skip clusterings whose second-largest cluster is tiny
            # relative to the even-split size (degenerate solutions).
            if max(sorted(cc(labels).values(),
                          reverse=True)[1::]) / (len(labels) / cLen) < 0.05:
                continue
            #			print max(sorted(cc(labels).values(),reverse=True)[1::])/(len(labels)/cLen),i+2
            #			print len(labels)/cLen,len(labels),cc(labels).values()

            for k, c in enumerate(centers):
                center_loc[k] = c
            # For every center, the other centers ordered by distance.
            for k, c in enumerate(centers):
                center_key[k] = sorted([(dist(centers[m], c), m)
                                        for m in range(len(centers))
                                        if m != k])
            # Map each sample to its cluster and to its distance from
            # every center.
            for j in range(len(self.vals)):
                center_samples[labels[j]].append(j)
                sample_to_center[j] = {
                    k: dist(self.vals[j], c)
                    for k, c in enumerate(centers)
                }

            # s_dists is computed but only used by the commented-out
            # diagnostics below.
            for c, S in center_samples.items():
                s_dists = [(dist(self.vals[j], center_loc[c]), j) for j in S]
#				print sorted([x[0] for x in s_dists],reverse=True)[0:5]
#				print i,len(centers),centers
#				print 'to',len(s_dists)

            # Report each cluster's dominant sample type (or raw feature
            # ids when no sample key is available).
            for c, S in center_samples.items():
                if self.SAMPLES:
                    sIDs = [(self.key[self.samples[s]], self.samples[s])
                            for s in S]
                    sTypes = sorted([(x[1], x[0])
                                     for x in cc([s[0]
                                                  for s in sIDs]).items()],
                                    reverse=True)
                    print i + 2.0, c, len(
                        S), sTypes[0][1], sTypes[0][0] / float(len(S))
                else:
                    sIDs = [(self.data.feats[s], s) for s in S]
                    print i + 2.0, c, len(S), sIDs

        # ideas are to do condensation based distanaces, and do set parameters for the grouping, we need a min size rate for distance clusters, min support (number close) and no outliers, and consistency

        sys.exit()
Ejemplo n.º 5
0
    def summarize(self, parent, Y):
        """Summarize how outcome Y distributes across the groups of one
        parent variable, using rank-based chi-square tests on the lowest
        and highest tails.

        parent -- variable name indexing self.vals/key/lens/opts/fracs/type
        Y      -- per-sample outcome values (indexable by the indices in
                  self.key[parent])

        Returns a VariableSummarize built from the per-group mean /
        fraction-positive table plus the low- and high-tail p-values.
        """

        self.summaries = {}
        vals, key, lens, opts, fracs = self.vals[parent], self.key[
            parent], self.lens[parent], self.opts[parent], self.fracs[parent]

        # Outcome values grouped by the parent's categories.
        yKey = {k: [Y[i] for i in key[k]] for k in key}
        yMeans = {k: np.mean(v) for k, v in yKey.items()}
        # Fraction of positive outcomes per group.
        yObs = {
            k: len([vi for vi in v if vi > 0]) / float(len(v))
            for k, v in yKey.items()
        }

        yObsMean = sorted([(np.mean(v),
                            len([vi for vi in v if vi > 0]) / float(len(v)), k)
                           for k, v in yKey.items()])

        # Pool all (value, group) pairs and rank them; zeros share the
        # bottom, positive values get ranks 1..len(yNZ).
        yMix = sorted([
            a for b in [[(v, k) for v in yKey[k]] for k in yKey.keys()]
            for a in b
        ])
        yZero = [ym for ym in yMix if ym[0] == 0]
        yNZ = [ym for ym in yMix if ym[0] > 0]
        yRanks = yZero + [(i + 1, yNZ[i][1]) for i in range(len(yNZ))]
        yChis = {
            p: [yRanks[i][0] for i in range(len(yRanks)) if yRanks[i][1] == p]
            for p in yKey.keys()
        }
        chiSort = sorted([(np.mean(yChis[p]), p) for p in yChis.keys()])
        # Tail sizes: the low tail is at least as large as the zero block.
        low_len, hi_len = max(lens[chiSort[0][1]],
                              len(yZero)), lens[chiSort[-1][1]]
        lowExp, hiExp = [fracs[k] * low_len for k in yChis.keys()
                         ], [fracs[k] * hi_len for k in yChis.keys()]
        lowCC = cc([yRanks[i][1] for i in range(0, low_len)])
        hiCC = cc(
            [yRanks[i][1] for i in range(len(yRanks) - hi_len, len(yRanks))])
        lowObs, hiObs = [lowCC[k] for k in yChis.keys()
                         ], [hiCC[k] for k in yChis.keys()]
        lowChi, hiChi = chisquare(lowObs,
                                  f_exp=lowExp)[1], chisquare(hiObs,
                                                              f_exp=hiExp)[1]

        if self.type[parent] == 'binary':

            VS = VariableSummarize('binary').add_data(yObsMean, lowChi, hiChi)
            #VS = VariableSummary('binary') #.add_data(yObsMean,chiSort,(lowChi,hiChi))

            #VS.add_data(sorted(yMeans.items(),key=lambda X:X[1]),sorted(yObs.items(),key=lambda X: X[1]),chiSort,(lowChi,hiChi))
        else:
            # Fixed typo: was 'contiinuous' (the commented-out variant
            # below already spelled it 'continuous').
            VS = VariableSummarize('continuous').add_data(
                yObsMean, lowChi, hiChi)
            #VS = VariableSummary('continuous')
            #VS.add_data(sorted(yMeans.items(),key=lambda X:X[1]),sorted(yObs.items(),key=lambda X: X[1]),chiSort,(lowChi,hiChi))

        return VS
Ejemplo n.º 6
0
 def test_default_main():
     """Diff the two sample workbooks and verify the tally of change
     types (replace/insert/delete) reported on stdout."""
     from collections import Counter as cc

     lines = stdoutcapture(["-n", tdir + "diff1.xlsx", tdir + "diff2.xlsx"])
     # Second tab-separated field of each output line is the change type.
     op_counts = cc(line.replace('"', '').split("\t")[1] for line in lines)
     assert op_counts == cc({"replace": 3, "insert": 2, "delete": 1})
Ejemplo n.º 7
0
    def binary_labels(self, attribute, label='color', rank=None):
        """Assign a color or marker label to every sample according to one
        attribute, collapsing rare values into a shared 'missing' bucket.

        attribute -- key into each sample's attributes dict ('NA' = missing)
        label     -- 'color' or 'marker'; selects the label generator
        rank      -- passed through to get_color_list / get_marker_list

        Returns (label_vals, label_labels): per-sample labels and the
        group name each sample was assigned to.
        NOTE(review): implicitly returns None when no sample has a valid
        (non-'NA') value, and label_list is unbound if `label` is neither
        'color' nor 'marker' — confirm callers never hit those paths.
        """

        # All values vs. only non-missing values for this attribute.
        sAll, sValid = [s.attributes[attribute] for s in self._list], [
            s.attributes[attribute] for s in self._list
            if s.attributes[attribute] != 'NA'
        ]

        if len(set(sValid)) > 0:
            # Minimum group size: capped by the second-most-common value
            # count; falls back to 10 when there is only one value.
            try:
                minSize = min(self.min_group_size,
                              sorted(cc(sValid).values())[-2])
            except IndexError:
                minSize = 10
            sGrps, sIdx, sMissing = list(set(sValid)), [], []
            # With more than two groups, keep only sufficiently large ones.
            if len(sGrps) > 2:
                sGrps = sorted(
                    [a for (a, b) in cc(sValid).items() if b > minSize],
                    reverse=True)
            if len(sGrps) > self.max_group_members:
                sGrps = sGrps[0:self.max_group_members]

            # Map each sample to its group index; everything else goes to
            # the trailing 'missing' slot len(sGrps).
            for s in self._list:
                if s.attributes[attribute] in sGrps:
                    sIdx.append(sGrps.index(s.attributes[attribute]))
                else:
                    sIdx.append(len(sGrps))
                    sMissing.append(s.attributes[attribute])
            if label == 'color':
                label_list, missing_label = get_color_list(
                    len(sGrps),
                    len(sMissing) != 0,
                    rank,
                    OFFSET=self.color_offset)
                # Advance the offset so the next attribute gets new colors.
                self.color_offset += len(label_list)

            elif label == 'marker':

                label_list, missing_label = get_marker_list(
                    len(sGrps),
                    len(sMissing) != 0, rank)

            # Append a combined group name + label for the missing bucket.
            if len(sMissing) > 0:
                sGrps.append(
                    attribute + '=' +
                    ",".join([sM.split('~')[-1]
                              for sM in list(set(sMissing))]))
                label_list.append(missing_label)

            label_vals, label_labels = [label_list[i] for i in sIdx
                                        ], [sGrps[i] for i in sIdx]
            return label_vals, label_labels
Ejemplo n.º 8
0
    def partitionLabels(self, S):
        """
        :type S: str
        :rtype: List[int]

        Greedy single pass: `remaining` counts the occurrences of each
        character still ahead; once every character seen in the current
        chunk is exhausted, the chunk can be closed.

        Example: "ababcbacadefegdehijhklij" -> [9, 7, 8]
        """
        from collections import Counter as cc

        # The original imported xrange from __builtin__, which exists
        # only on Python 2; plain range has identical semantics for this
        # bounded loop on both Python 2 and 3.
        remaining = cc(S)
        open_chars = set()
        result = []
        start = 0

        for i in range(len(S)):
            ch = S[i]
            open_chars.add(ch)
            remaining[ch] -= 1

            if remaining[ch] == 0:
                open_chars.remove(ch)

                # No character of the current chunk occurs again: close it.
                if not open_chars:
                    result.append(i - start + 1)
                    start = i + 1

        return result
Ejemplo n.º 9
0
	def draw_multi_group(self,multi_group): 
		"""Draw a cluster of related groups around their shared centroid.

		multi_group -- list of (name, ..., (x, y)) tuples; first element
		is the group name, last element is its location.

		Draws each group at a point on a small circle around the
		centroid, places its unique and multiply-observed genes on
		concentric circles, then draws genes shared between groups around
		the centroid itself.  Pure drawing side effects; returns None.
		"""

		names, locs = [mg[0] for mg in multi_group],[mg[-1] for mg in multi_group]
		# gc counts how many of these groups each gene appears in.
		gc = cc([a for b in [[self.group_key[n] for n in names] for a in b]) if False else cc([a for b in [self.group_key[n] for n in names] for a in b]) 
		centroid = np.mean([mg[-1][0] for mg in multi_group]),np.mean([mg[-1][1] for mg in multi_group])
		cluster_pts  = self.make_gene_circle(5.0,len(names),origin=centroid) 
		shared_genes = [g for g in gc if gc[g] > 1] 

		for pt in cluster_pts:
			# Assign each circle point to the nearest not-yet-placed group.
			group  = names[sorted([(np.linalg.norm(np.array(pt)-np.array(multi_group[i][-1])),i) for i in range(len(multi_group)) if multi_group[i][0] not in self.group_locs])[0][1]]
			self.draw_group(pt,group) 
			# uG: genes observed once; fG: multiply-observed genes unique
			# to this group within the cluster.
			uG  = [g for g in self.group_key[group] if self.obs[g] == 1]
			fG  = [g for g in self.group_key[group] if self.obs[g] > 1 and gc[g] == 1]
			if len(uG) > 1:	
				# NOTE(review): len(fG) here (zipped against uG below)
				# looks like it was meant to be len(uG) — confirm.
				uniq_pts   =   self.make_gene_circle(10,len(fG),origin=pt,start_value = math.atan(pt[1]/pt[0])-(math.pi/4))
				for p,g in zip(uniq_pts,uG):	self.draw_gene(p,g,CENTER=pt)
			if len(fG) > 1:
				far_pts   =    self.make_gene_circle(30,len(fG),origin=pt,start_value = math.atan(pt[1]/pt[0])-(math.pi/6))
				for p,g in zip(far_pts,fG):	self.draw_gene(p,g,CENTER=pt)

		# Genes shared by several groups are drawn around the centroid;
		# 'uniq' = all observations inside this cluster, 'multi' = also
		# observed elsewhere.
		if self.DUPES: 
			cluster_uniq, cluster_multi = [g for g in shared_genes if self.obs[g] == gc[g]] , [g for g in shared_genes if self.obs[g] > gc[g]] 
		else:
			cluster_uniq, cluster_multi = [g for g in shared_genes if g not in self.primary_locations and self.obs[g] == gc[g]] , [g for g in shared_genes if g not in self.primary_locations and self.obs[g] > gc[g]] 

		if len(cluster_uniq) > 0:
			uniq_pts   =   self.make_gene_circle(20,len(cluster_uniq),origin=centroid,start_value = math.atan(centroid[0]/centroid[1])-(math.pi/2))
			for p,g in zip(uniq_pts,cluster_uniq):	self.draw_gene(p,g,CENTER=centroid)
		if len(cluster_multi) > 0: 
			far_pts   =    self.make_gene_circle(40,len(cluster_multi),origin=centroid,start_value = math.atan(centroid[0]/centroid[1])-(math.pi/5))
			for p,g in zip(far_pts,cluster_multi):	self.draw_gene(p,g,CENTER=centroid)

		return
Ejemplo n.º 10
0
	def score_ids(self,id_list,gene,my_cv): 
		"""Score candidate identities for one gene across self.IDS.

		id_list -- per-observation tuples, one field per ID in self.IDS
		gene    -- gene name used for logging and keying self.scr_key
		my_cv   -- extra value echoed into the log line

		For each ID column, candidates with more than 10 observations are
		scored by observed count / expected count (self.exp_ids); the top
		score and its margin over the runner-up are recorded in
		self.scr_key[gene] and written to self.wRes.

		Returns the top 'CTX' name when its score exceeds 1.5, else False.
		"""

		TOPNAME = False 	
		for i,ID in enumerate(self.IDS): 

			# Count how often each candidate value appears in column i.
			vt = cc([x[i] for x in id_list])
			scores = sorted([(cv/float(self.exp_ids[ID][cx]),cx) for cx,cv in vt.items() if cv >10],reverse=True)

			if len(scores) == 0: 
				# No candidate cleared the >10 observation threshold.
				self.scr_key[gene].append((ID,'NA',1,1)) 
				continue  
			else:
	
				# Runner-up defaults keep topScr/nextScr well-defined
				# when only one candidate scored.
				topScr,topName,nextScr,nextName = scores[0][0],scores[0][1],0.1,'None'
				if len(scores) > 1: nextScr,nextName = scores[1][0],scores[1][1] 

				self.wRes.write('%s %s %s %s %s | %s %s %s\n' % (gene,ID,topName,topScr,topScr/nextScr,nextName,nextScr,my_cv))
				self.scr_key[gene].append((ID,topName,round(topScr,3),round(topScr/nextScr,3)))

				if ID == 'CTX': 
					if topScr > 1.5: 
						TOPNAME = topName 

		return TOPNAME 
Ejemplo n.º 11
0
    def add_hist(self, h_data):
        """Draw a histogram of h_data's values on self.ax, with a rug of
        outlier-colored points just above the axes.

        h_data -- mapping whose values are plotted (keys are collected
        into `x` but never used in this variant).

        Returns self for chaining.
        """

        # `x` is unused below — kept only for symmetry with the dict input.
        x, y = h_data.keys(), sorted(h_data.values())

        # If auto-binning degenerates to fewer than 2 bins, redraw with a
        # small fixed bin count based on the number of distinct values.
        if len(self.ax.hist(y, bins='auto')[0]) < 2:
            self.ax.clear()
            self.ax.hist(np.hstack(y), bins=min(4, len(cc(y))))
        yMin, yLim = self.ax.get_ylim()
        xMin, xLim = self.ax.get_xlim()
        HI, LO = False, False
        if yMin == 0:
            h_srt = sorted(y)
            # MAD-based outlier detection supplies per-point colors; the
            # color of the least-outlying point is used for inliers.
            out_bool, out_scrs, out_colors = mad_based_outlier(np.array(h_srt))
            g_color = out_colors[sorted([
                (out_scrs[oi], oi) for oi in range(len(out_scrs))
            ])[0][1]]
            # Rug plot just above the top of the axes (clip_on=False).
            for i, h in enumerate(h_srt):
                if not out_bool[i]:
                    self.ax.scatter(h,
                                    yLim * 1.025,
                                    alpha=0.7,
                                    color=g_color,
                                    clip_on=False)
                else:
                    self.ax.scatter(h,
                                    yLim * 1.025,
                                    alpha=0.7,
                                    color=out_colors[i],
                                    clip_on=False)
            # Restore the limits the scatter calls may have disturbed.
            self.ax.set_ylim(yMin, yLim)
            self.ax.set_xlim(xMin, xLim)
        return self
Ejemplo n.º 12
0
    def evaluate_markers(self, outstr):
        """Write a report of features enriched in particular groups.

        outstr -- path of the report file to create.

        For every (feature, counts) pair and every binary interest
        variable, features passing a t-test filter (min p <= 0.01 and
        max fold-change >= 2) are scanned for groups whose share of the
        top-ranked samples exceeds expectation by >= 1.25x.
        NOTE(review): the output file handle is never closed — relies on
        interpreter exit / GC to flush.
        """
        out = open(outstr, 'w')
        #	out = sys.stdout

        out.write(
            '%-30s %25s %10s | %13s %13s %13s | %10s\n' %
            ('---', 'INTEREST', 'OBS', 'obsN', 'totN', 'enrich', 'stat-data'))
        for (feature, counts) in zip(self.dex.features, self.dex.counts):
            for interest in self.results.keys():
                if self.dex.types[interest] != 'binary': continue
                #for k in self.results[interest][feature]:
                #for k in self.results[interest][feature]:
                #for k in range(1):
                scores, cnts = self.results[interest][feature]['ttest']
                # Significance and effect-size filters.
                if min([s[1] for s in scores]) > 0.01: continue
                if max([s[2] for s in scores]) < 2.0: continue
                # Pool all (value, group) pairs, ranked by value desc.
                my_cnts = []
                for c in cnts.keys():
                    my_cnts.extend([(s, c) for s in cnts[c][-1]])
                my_cnts.sort(reverse=True)
                my_cc, my_len = cc([m[1] for m in my_cnts]), len(my_cnts)
                total = sum(my_cc.values())

                # tots: group sizes; exp: expected group fractions.
                tots, exp, obs, n, enrich = {x: y
                                             for x, y in my_cc.items()}, {
                                                 x: y / float(total)
                                                 for x, y in my_cc.items()
                                             }, dd(float), 0.0, dd(list)

                # Walk down the ranking, periodically checking whether a
                # group is over-represented among the top-n samples.
                for i, (mS, m) in enumerate(my_cnts):
                    if mS == 0: break
                    n += 1.0
                    obs[m] += 1.0
                    if i > 10 and i % 3 == 0:
                        for a, aX in obs.items():
                            ech = (aX / n) / exp[a]
                            if ech > 1.25:
                                # Keep at most 3 snapshots unless the
                                # enrichment keeps improving.
                                if len(enrich[a]) < 3 and ech > 1.25:
                                    enrich[a].append([ech, aX, n])
                                elif len(enrich[a]
                                         ) >= 3 and ech > enrich[a][-1][0]:
                                    enrich[a].append([ech, aX, n])

                if len(enrich.keys()) == 0: continue
                for (e, eX) in enrich.items():
                    # Require at least a third of the group in the tail.
                    if obs[e] > tots[e] / 3.0:
                        my_scrs = [s for s in scores if e in s[0]]
                        out.write('%-30s %25s %10.3f |' %
                                  (feature, e, obs[e] / float(tots[e])))
                        out.write(' %13s %13s %13.3f |' %
                                  (eX[-1][1], eX[-1][2], eX[-1][0]))
                        #print feature,e,round(obs[e] / float(tots[e]),3)
                        #print '|',eX[-1][1],eX[-1][2],round(eX[-1][0],3),
                        for (a, b), pv, fc in my_scrs:

                            out.write(' %10s %5.2e  %5.2f' %
                                      (a + ',' + b, pv, fc))
                        out.write('\n')
Ejemplo n.º 13
0
    def check_halfs(self):
        """Chi-square tests on the group composition of the top and
        bottom tails of the pooled, value-ranked data.

        Reads self.vals ({group: [values]}) and self.size
        ({group: size}); relies on module-level `cc`
        (collections.Counter) and scipy's `chisquare`.

        Returns (P1, P2): p-values for the top `pres` fraction and the
        bottom `pnot` fraction, or (1.0, 1.0) when fewer than 4% of the
        values are positive.
        """

        keys = self.vals.keys()
        # All values sorted ascending, pooled across groups.
        vals = [
            x[0] for x in sorted([
                a for b in [[(kx, k) for kx in self.vals[k]]
                            for k in self.vals.keys()] for a in b
            ])
        ]
        # Fraction of strictly positive values, capped at 0.4 below.
        pres = len([v for v in vals if v > 0]) / float(len(vals))
        aS = float(len(vals))

        if pres < 0.04: return 1.0, 1.0

        if pres > 0.4: pres = 0.4

        pnot = 1 - pres

        # Expected group counts in each tail, proportional to group size.
        ec = [int((self.size[k] / aS) * (aS * pres)) for k in keys]
        enc = [int((self.size[k] / aS) * (aS * pnot)) for k in keys]

        # Group labels of the top pres fraction (highest values first).
        pts = [
            x[1] for x in sorted([
                a for b in [[(kx, k) for kx in self.vals[k]]
                            for k in self.vals.keys()] for a in b
            ],
                                 reverse=True)
        ][0:int(aS * pres)]
        # Group labels of the bottom pnot fraction (lowest values first).
        ptsN = [
            x[1] for x in sorted([
                a for b in [[(kx, k) for kx in self.vals[k]]
                            for k in self.vals.keys()] for a in b
            ])
        ][0:int(aS * pnot)]

        pc = cc(pts)
        obs = [pc[k] for k in keys]

        pn = cc(ptsN)
        obsN = [pn[k] for k in keys]

        # NOTE(review): int() truncation can make sum(obs) != sum(ec);
        # recent scipy versions raise when the sums disagree — confirm
        # the pinned scipy tolerates this.
        P1 = chisquare(obs, f_exp=ec)[1]
        P2 = chisquare(obsN, f_exp=enc)[1]

        return P1, P2
Ejemplo n.º 14
0
	def check_neighbors(self,s_order,ID):
		"""For each position in an ordered sample list, report the most
		common label among its neighbors inside a fixed-size window.

		s_order -- list of (label, sample) pairs in rank order
		ID      -- identifier echoed into each output line

		Writes one line per position to self.wNay; no return value.
		"""

		# Window spans self.sample_range positions; CHECK_RANGE is the
		# minimum side size required to trust a one-sided top label.
		FULL_RANGE, HALF_RANGE,CHECK_RANGE = self.sample_range,self.sample_range/2,(self.sample_range/2)-1
		for i in range(len(s_order)): 

			# NOTE(review): i+(FULL_RANGE-i)+1 simplifies to
			# FULL_RANGE+1 regardless of i — confirm this is the
			# intended left-edge window.
			if i < HALF_RANGE: 
				left=0 
				right=i+(FULL_RANGE-i)+1 

			

			elif i + HALF_RANGE > len(s_order):
				left = i - (FULL_RANGE - (len(s_order) - i) )-1 
				right = len(s_order) 

			else: 
				left = i - HALF_RANGE 
				right = 1+i+HALF_RANGE


			# Labels strictly before / after position i inside the window.
			prev = [p[0] for p in s_order[left:i]]
			post = [p[0] for p in s_order[i+1:right]] 

			preCC,postCC,bothCC = cc(prev), cc(post) , cc(prev+post) 

			
			# Label frequencies within each side, highest first.
			preTups = sorted([(x,float(y)/sum(preCC.values())) for x,y in preCC.items()],reverse=True,key=lambda X: X[1]) 
			postTups = sorted([(x,float(y)/sum(postCC.values())) for x,y in postCC.items()],reverse=True,key=lambda X: X[1]) 
			bothTups = sorted([(x,float(y)/sum(bothCC.values())) for x,y in bothCC.items()],reverse=True,key=lambda X: X[1]) 


			# Runner-up defaults avoid a zero division in the ratio below.
			topBoth,nextBoth = bothTups[0],('NA',0.01)
			if len(bothTups)>1: nextBoth = bothTups[1] 



			self.wNay.write('%s %s %s %s ' % (s_order[i][1],i,ID,s_order[i][0]))
			self.wNay.write('%s %4.4f %3.2f %s %4.4f | ' % (topBoth[0],topBoth[1],topBoth[1]/nextBoth[1],nextBoth[0],nextBoth[1]))

			# One-sided winners only count when that side is big enough.
			topPrev,topPost = ('NA',0.01),('NA',0.01) 
			if len(prev) > CHECK_RANGE: topPrev = preTups[0]
			if len(post) > CHECK_RANGE: topPost = postTups[0]
			allTops = sorted([topPrev,topPost,topBoth],reverse=True,key=lambda X: X[1]) 
			self.wNay.write('%s %4.4f \n' % (allTops[0][0],allTops[0][1]))
Ejemplo n.º 15
0
def simsample_items(L, size=200):
    """Resample the values of L into exactly `size` draws and return a
    per-bucket histogram.

    Each value is scaled by 10 and rounded to an integer bucket, so
    inputs are expected in [0.0, 1.0].  The pool is recycled whole as
    many times as it fits, then topped up with a random sample without
    replacement, so counts always sum to `size` for in-range inputs.

    Returns a list of 11 counts, one per integer bucket 0..10.
    """
    pool = [int(round(10.0 * v, 0)) for v in L]

    # Guard: the original looped forever on an empty input list.
    if not pool:
        return [0 for _ in range(0, 11)]

    shuffle(pool)

    drawn = []
    # Recycle the whole pool while another full copy still fits...
    while len(drawn) + len(pool) < size:
        drawn.extend(pool)
    # ...then fill the remainder with a random subset.
    drawn.extend(random.sample(pool, size - len(drawn)))

    counts = cc(drawn)
    # Counter already returns 0 for missing keys; no membership test needed.
    return [counts[bucket] for bucket in range(0, 11)]
Ejemplo n.º 16
0
    def rewrite(self, words, k):
        """
        :type words: List[str]
        :type k: int
        :rtype: List[str]

        Return the k most frequent words, most frequent first, breaking
        frequency ties alphabetically.  (The original version stopped
        after building the Counter and implicitly returned None, despite
        declaring :rtype: List[str].)
        """
        from collections import Counter as cc

        counts = cc(words)
        # Sort by descending frequency, then lexicographically for ties.
        ranked = sorted(counts, key=lambda w: (-counts[w], w))
        return ranked[:k]
Ejemplo n.º 17
0
    def rewrite(self, graph, initial):
        """
        :type graph: List[List[int]]
        :type initial: List[int]
        :rtype: int

        Malware spread: union nodes connected in the adjacency matrix,
        count component sizes, and among the initially infected nodes
        return the one in the largest component (smallest index on ties).

        Union-find without rank compression.
        """
        from collections import Counter as cc

        nodes = len(graph)
        # list(...) is required: on Python 3, range() is an immutable
        # sequence, so `parents[fx] = fy` below would raise TypeError.
        parents = list(range(nodes))

        def find(node):
            # Walk up until a self-parented root is reached.
            if parents[node] != node:
                return find(parents[node])
            return node

        def union(x, y):
            fx = find(x)
            fy = find(y)
            parents[fx] = fy

        # Build the union-find forest; graph[x][y] == 1 means connected.
        for x in range(nodes):
            for y in range(x + 1, nodes):
                if graph[x][y] == 1:
                    union(x, y)

        # Nodes sharing a root form one component, so this Counter maps
        # each component's root to the component size.
        allNodes = cc(find(i) for i in range(nodes))

        # Rank each infected node by (component size, -node); max() then
        # prefers the largest component and, on ties, the smallest index.
        result = []
        for bad in initial:
            key = find(bad)
            result.append((allNodes[key], -bad))

        return -max(result)[1]
Ejemplo n.º 18
0
    def numJewelsInStones(self, J, S):
        """
        :type J: str
        :type S: str
        :rtype: int

        Count how many of the stones in S are jewels, i.e. characters
        that also appear in J.
        """
        from collections import Counter as cc

        jewel_counts = cc(J)
        stone_counts = cc(S)

        return sum(count for stone, count in stone_counts.items()
                   if stone in jewel_counts)
Ejemplo n.º 19
0
    def add_hist(self, h_data):
        """Draw a histogram of h_data on self.ax with a rug of
        outlier-colored points above the axes; labels strong outliers.

        h_data -- list of values, or a mapping whose values are plotted
        (keys go into `x` and are used only for outlier labels below —
        NOTE(review): for the dict branch, `x[i].name` assumes the keys
        are objects with a .name attribute and that x is indexable;
        confirm callers pass such a mapping on Python 3).

        Returns self for chaining.
        """

        if type(h_data) == list:
            x, y = range(len(h_data)), sorted(h_data)
        else:

            x, y = h_data.keys(), sorted(h_data.values())

        # If auto-binning degenerates to fewer than 2 bins, redraw with a
        # small fixed bin count based on the number of distinct values.
        if len(self.ax.hist(y, bins='auto')[0]) < 2:
            self.ax.clear()
            self.ax.hist(np.hstack(y), bins=min(4, len(cc(y))))
        yMin, yLim = self.ax.get_ylim()
        xMin, xLim = self.ax.get_xlim()
        HI, LO = False, False

        if yMin == 0:

            h_srt = sorted(y)
            # MAD-based outlier detection supplies per-point colors; the
            # color of the least-outlying point is used for inliers.
            out_bool, out_scrs, out_colors = mad_based_outlier(np.array(h_srt))
            g_color = out_colors[sorted([
                (out_scrs[oi], oi) for oi in range(len(out_scrs))
            ])[0][1]]

            # Marker size shrinks as the dataset grows.
            if len(h_srt) < 100: s = 10
            elif len(h_srt) < 1000: s = 5
            else: s = 3

            # Rug plot just above the top of the axes (clip_on=False).
            for i, h in enumerate(h_srt):
                if not out_bool[i]:
                    self.ax.scatter(h,
                                    yLim * 1.025,
                                    alpha=0.7,
                                    color=g_color,
                                    s=s,
                                    clip_on=False)
                else:
                    self.ax.scatter(h,
                                    yLim * 1.025,
                                    alpha=0.7,
                                    color=out_colors[i],
                                    s=s,
                                    clip_on=False)
                    # Once the outlier score jumps relative to the two
                    # previous points, start labeling points by name.
                    if i > 5 and (out_scrs[i] /
                                  (out_scrs[i - 1] + out_scrs[i - 2])) > 0.66:
                        HI = True
                    if HI:
                        try:
                            self.ax.text(h, yLim * 1.030,
                                         x[i].name.split(";")[-1])
                        except AttributeError:
                            continue

            # Restore the limits the scatter calls may have disturbed.
            self.ax.set_ylim(yMin, yLim)
            self.ax.set_xlim(xMin, xLim)
        return self
Ejemplo n.º 20
0
    def calculate_chi_enrichment(self, pc_ids, pc_ids2=None):
        """Chi-square enrichment of observed category counts against the
        expected rates stored in self.prc_rates.

        pc_ids  -- list of category ids to tally.
        pc_ids2 -- optional second id list; the two-list comparison is
                   not implemented, so the method returns None (falls
                   through) when it is non-empty, as before.

        Returns (n_ids, observed_counts, chi-square p-value, id of the
        most over-represented category) for the single-list case.
        """
        # None instead of a mutable [] default: a shared default list is
        # the classic Python mutable-default pitfall.
        if pc_ids2 is None:
            pc_ids2 = []
        if len(pc_ids2) == 0:
            cLen = len(pc_ids)
            c_cc = cc(pc_ids)
            # Expected counts follow the stored per-category rates.
            c_exp = [cLen * self.prc_rates[k] for k in self.prc_rates]
            c_obs = [c_cc[k] if k in c_cc else 0 for k in self.prc_rates]

            # Category with the largest observed-minus-expected excess.
            chi_over = sorted([
                (co - ce, k) for co, ce, k in zip(c_obs, c_exp, self.prc_rates)
            ])[-1][1]

            chi_pv = chisquare(c_obs, f_exp=c_exp)[1]
            return cLen, c_obs, chi_pv, chi_over
    def rewrite(self, s):
        """
        :type s: str
        :rtype: str

        Rebuild s with its characters grouped and ordered by descending
        frequency (ties keep Counter.most_common's ordering).
        """
        from collections import Counter as cc

        frequency = cc(s)
        return "".join(ch * n for ch, n in frequency.most_common())
Ejemplo n.º 22
0
    def rewrite(self, strs):
        """
        :type strs: List[str]
        :rtype: List[List[str]]

        Group anagrams: words sharing the same character multiset land
        in the same bucket.  Returns the buckets as dict values, in
        first-seen order.
        """
        from collections import Counter as cc

        buckets = dict()

        for word in strs:
            # A sorted tuple of (char, count) pairs is a hashable,
            # order-independent signature of the word's letters.
            signature = tuple(sorted(cc(word).items()))
            buckets.setdefault(signature, []).append(word)

        return buckets.values()
Ejemplo n.º 23
0
    def collate_continuous(self):
        """Collect per-sample electrophysiology statistics: continuous
        stats go into self.cont_res, categorical calls (including a
        firing-style label) into self.binary_res.

        NOTE(review): self.bin_res is initialized here but never used;
        the code below writes to self.binary_res, which is not defined
        in this method — and dd(list) would not support the
        [k][s] = v assignment anyway.  Presumably self.binary_res is a
        dict-of-dicts defined elsewhere; confirm.
        """
        self.cont_res = dd(list)
        self.bin_res = dd(list)
        # 'SPIKES' plus every two-part key (e.g. 'A-B') is a stat source.
        n_keys = ['SPIKES'
                  ] + [k for k in self.data.keys() if len(k.split('-')) == 2]
        x_keys = [k for k in self.data.keys() if len(k.split('-')) != 2]

        # Thresholds: MINCC = min 'C' responses; ST1/ST2/ST3 = spike-count
        # cutoffs for SUSTAINED classification.
        self.MINCC = 1
        self.ST1, self.ST2, self.ST3 = 15, 5, 1

        for s in self.samples:
            s_amps, s_len = self.data['POS_AMPS'][s], len(
                self.data['POS_AMPS'][s])

            ss = EFIZ_STAT(s, s_amps, s_len)
            for n in n_keys:
                #if n[0] == 'R': continue
                ss.set_stat(n, self.data[n][s])
                for k, v in ss.key.items():
                    self.cont_res[k].append(v)

                for k, v in ss.categorical.items():
                    self.binary_res[k][s] = v

            # Prepend 0 so max(spikes) below is safe on empty spike lists.
            spikes, response, r_key = [
                0
            ] + self.data['SPIKES'][s], self.data['RESPONSE'][s], dd(int)
            hLen, xLen = len([sp for sp in spikes if sp > self.ST2
                              ]), len([sp for sp in spikes if sp > self.ST3])

            for x, y in cc(response).items():
                r_key[x] += y
            # Firing-style classification, strictest first.
            if r_key['C'] >= self.MINCC and max(
                    spikes) >= self.ST1 and hLen > 1 and xLen > 2:
                FS = 'SUSTAINED'
            elif max(spikes) > 3:
                FS = 'ACTIVE'
            elif max(spikes) > 0:
                FS = 'RESPONSIVE'
            else:
                FS = 'ABORTIVE'

            self.binary_res['FSTYLE'][s] = FS
Ejemplo n.º 24
0
    def mostCommonWord(self, paragraph, banned):
        """
        :type paragraph: str
        :type banned: List[str]
        :rtype: str

        Return the most frequent word of `paragraph` (case-insensitive,
        punctuation treated as whitespace) that is not banned.
        """
        from collections import Counter as cc
        import re

        # A set of lowercased banned words replaces the Python-2-only
        # `from __builtin__ import unicode` + map(unicode.lower, ...)
        # construct, and makes each membership test O(1).
        banned_set = set(word.lower() for word in banned)

        # Strip punctuation/digits, normalize case, split on whitespace.
        words = re.sub(r'[^a-zA-Z]', ' ', paragraph).lower().split()

        word_counts = cc(w for w in words if w not in banned_set)

        result = word_counts.most_common()[0][0]
        return result
Ejemplo n.º 25
0
	def test_ks(self):
		"""Evaluate k-means solutions (k = 2 ..) on self.vals and store
		per-k separation statistics in self.result.

		For each clustering, every cluster is compared against all other
		centers using trimmed mean distances (trim fraction self.tm) and
		per-sample nearest-alternative ratios; clusters whose samples sit
		more than self.standard times closer to their own center count as
		well-separated.

		self.result[k] = [avg, min, max of the per-cluster separation
		scores, plus per-cluster [n_samples, separated fraction]].
		"""
		self.result = {} 
		self.standard = 4
		self.run = [self.km[p].fit(self.vals) for p in range(1,len(self.km))]	
		k_centers, k_labels = [k.cluster_centers_ for k in self.run], [k.labels_ for k in self.run]
		self.samples = len(k_labels[0]) 
 
		for i,centers,labels in zip(range(len(self.run)),k_centers,k_labels):
			center_key, val_key = dd(list), dd(lambda: dd(list))
			# center_key[k]: other centers sorted by distance from k;
			# val_key[cluster][center]: member distances to that center.
			for k,c in enumerate(centers):
				center_key[k] = sorted([(dist(centers[m],c),m) for m in range(len(centers)) if m != k])
				for j in range(len(self.vals)):
					val_key[labels[j]][k].append(dist(self.vals[j],c))
			if i < 0:	continue  
			else:
				center_stats = {} 
				for myC,altCs in sorted(center_key.items()):
					mDists, nAvgs, nScrs  = val_key[myC][myC], [], []
					# mArray rows: [own-center distance, [alt distances]].
					mArray = [[mD,[]] for mD in mDists]
					# Trimmed mean of own-center distances.
					mAvg = np.mean(sorted(mDists)[0:int(1.0+len(mDists)*self.tm)])
					for (jScr,jNum) in altCs:
						jDists = val_key[myC][jNum]
						for n in range(len(jDists)): mArray[n][-1].append(jDists[n])
						# Relative closeness to the alternative center.
						jScrs   = sorted([pJ/(pJ+pM) for pM,pJ in zip(mDists,jDists)],reverse=True)[0:int(1.0+len(mDists)*self.tm)]
						jAvg = np.mean(sorted(jDists[0:int(len(mDists)*self.tm)]))
						if len(jScrs) < 1: nScrs.append(0.0)
						else:		   nScrs.append(sum(jScrs)/float(len(jScrs)))
						nAvgs.append(jAvg)  	
					center_stats[myC] = [mAvg,[nAvgs,nScrs],mArray]
				center_sizes = cc(labels).values() 
				c_min,c_avg,c_max,n_clusts = [], [], [],[]  
				for cent in sorted(center_stats):
					mAvg,[cAvgs,cScrs],cArray = center_stats[cent]
					# Separation score in (0, 1): higher means the
					# alternative centers are relatively farther away.
					c_dists = [round(cAvg/(mAvg+cAvg),3) for cAvg in cAvgs]
					c_min.append(min(c_dists))
					c_avg.append(round(np.mean(c_dists),3))
					c_max.append(max(c_dists))	
					# Per-sample ratio of nearest-alternative distance to
					# own distance (epsilon avoids division by zero).
					n_rates = sorted([min([d/(m+0.0000000001) for d in D]) for m,D in cArray])
					if len(n_rates) == 0:   n_scr = 0.0 
					else: 			n_scr = len([n for n in n_rates if n>self.standard])/float(len(n_rates))
					n_clusts.append([len(n_rates),round(n_scr,3)])
				self.result[i+2] = [round(sum(c_avg)/len(c_avg),3),round(sum(c_min)/len(c_min),3),round(sum(c_max)/len(c_max),3),n_clusts]
Ejemplo n.º 26
0
    def __init__(self, sample_attributes, variable_options, variable_key,
                 predictors, covariates):
        """Build the design-matrix bookkeeping for a regression model.

        sample_attributes -- {variable: per-sample value list}
        variable_options  -- {variable: [options, inferred]}; a variable
                             is treated as binary/categorical when its
                             inferred list is empty or holds strings
        variable_key      -- {variable: value-key mapping}
        predictors        -- variables of interest
        covariates        -- nuisance variables

        An 'intercept' pseudo-variable (all 1.0) is always prepended.
        """

        # list(...) is required on Python 3, where dict.values() returns
        # a non-indexable view (the original indexed it directly).
        first_column = list(sample_attributes.values())[0]
        sample_num = len(first_column)
        self.group_sizes = {'intercept': sample_num}
        self.BIN = dd(bool)

        for v in variable_options:
            # Empty or string-valued 'inferred' marks a categorical
            # variable: record the size of each observed group.
            if len(variable_options[v][1]) == 0 or isinstance(
                    variable_options[v][1][0], str):
                self.BIN[v] = True
                for n, x in cc(sample_attributes[v]).items():
                    self.group_sizes[n] = x

            else:
                self.group_sizes[v] = sample_num

        self.predictors, self.covariates, self.variables = predictors, covariates, predictors + covariates
        self.PREDICTOR, self.COVARIATE = dd(bool), dd(bool)
        self.PREDICTOR['intercept'], self.COVARIATE['intercept'] = True, True
        self.names = ['intercept']
        self.options, self.types, self.inferred = {
            'intercept': ['intercept']
        }, {}, dd(list)

        # Intercept column: constant 1.0 for every sample.
        self.vals = {'intercept': [1.0 for s in first_column]}
        self.key = {'intercept': {1.0: [1.0]}}
        # sample_vals is row-major: one row per sample, one column per
        # variable appended below.
        self.sample_vals = [[1.0] for s in first_column]

        for opt in variable_options:

            self.names.append(opt)
            self.options[opt], self.inferred[opt] = variable_options[opt][
                0], variable_options[opt][1]
            self.vals[opt] = sample_attributes[opt]
            for i, v in enumerate(self.vals[opt]):
                self.sample_vals[i].append(v)
            self.key[opt] = variable_key[opt]
            if opt in predictors: self.PREDICTOR[opt] = True
            else: self.COVARIATE[opt] = True
Ejemplo n.º 27
0
    def rewrite(self, nums, k):
        """Count the contiguous subarrays of nums whose sum equals k.

        Prefix-sum technique: keep a Counter of previously seen prefix
        sums; a subarray ending at the current position sums to k exactly
        when the prefix sum ``running - k`` was seen before.  Handles
        negative numbers, so a sliding window would not work here.

        :type nums: List[int]
        :type k: int
        :rtype: int
        """
        from collections import Counter

        seen = Counter({0: 1})  # the empty prefix
        running, hits = 0, 0

        for value in nums:
            running += value
            hits += seen[running - k]
            seen[running] += 1

        return hits
Ejemplo n.º 28
0
 def prepare_variables(self, variables):
     """Build the value columns and labels for the requested variables.

     Returns (values, labels, idxs) where
       values: one list per produced column -- 0/1 dummy columns for each
               retained option of a binary variable, raw values otherwise
       labels: matching column labels ('var=option' for dummies, 'var'
               for continuous variables)
       idxs:   indices of samples with no 'NA' in any requested variable
     """
     #sample_idxs = self.prepare_idxs(variables)
     # Keep only samples with complete data across all requested variables.
     idxs = [
         i for i in range(len(self.samples))
         if 'NA' not in [self.variables[v][i] for v in variables]
     ]
     values, labels = [], []
     for v in variables:
         if self.types[v] == 'binary':
             kVals = [self.variables[v][i] for i in idxs]
             kCount = cc(kVals)
             # Retain options reaching the minimum group size (the cap at
             # max(counts) guarantees the largest group always survives).
             kPass = [
                 kn for (kn, kv) in kCount.items()
                 if kv >= min(self.minGroupSize, max(kCount.values()))
             ]
             # Drop one reference level when every option survived.
             if len(kPass) == len(kCount.keys()): kPass = kPass[1::]
             # BUG FIX: the original built kV with an inner comprehension
             # `for v in kVals`; under Python 2 comprehension variables
             # leak, so `v` was clobbered with the last sample value
             # before the `v + '=' + opt` labels were built.  A distinct
             # inner name keeps the variable name intact.
             kV = [[1 if kval == opt else 0 for kval in kVals]
                   for opt in kPass]
             kL = [v + '=' + opt for opt in kPass]
         else:
             kV, kL = [[self.variables[v][i] for i in idxs]], [v]
         values.extend(kV)
         labels.extend(kL)
     return values, labels, idxs
Ejemplo n.º 29
0
    def fit_binary_dists(self, CUTOFF=4):
        """Exploratory binning of per-feature count distributions.

        For each input feature with between CUTOFF and 10 recorded counts,
        builds raw and log-transformed value lists (with and without
        zero-padding), draws a mean-matched Poisson sample, greedily bins
        the combined values into ~10%-mass bins, and prints the bin ranges.

        NOTE(review): debugging code -- it uses the Python 2 `print`
        statement and unconditionally calls sys.exit() at the end, so the
        caller never continues past this method.
        """

        for f in self.input.features:
            # vals: raw integer counts; logV: log(count + 1); dZ: zero
            # padding, one 0 per sample with no recorded count for this
            # feature (presumably -- depends on self.input.samples.len,
            # which is not visible here; confirm against the Input class).
            vals, logV, dZ = [int(x) for x in f.cnts.values()], [
                log(v + 1.0) for v in f.cnts.values()
            ], [
                0 for i in range(self.input.samples.len - len(f.cnts.values()))
            ]
            # Only consider features with a moderate number of counts.
            if len(vals) < CUTOFF: continue
            if len(vals) > 10: continue

            val_key = {
                'RAW-NZ': vals,
                'RAW-WZ': vals + dZ,
                'LOG-NZ': logV,
                'LOG-WZ': logV + dZ
            }

            # NOTE: this loop rebinds `vals`, shadowing the raw list above.
            for val_type, vals in val_key.items():

                # v10 ~ 10% of the observations: the target mass per bin.
                # vLen is currently unused.
                vLen, v10, vMean = len(vals), int(len(vals) *
                                                  0.10), np.mean(vals)
                # Only the RAW variants are processed below.
                if val_type.split('-')[0] == 'LOG': continue
                else:
                    # Poisson sample matched to the observed mean/length.
                    r = stats.poisson.rvs(vMean, size=len(vals))
                    # NOTE(review): r is a numpy array, so `vals + r` is
                    # elementwise (broadcast) addition, NOT list
                    # concatenation -- confirm this is intended; compare
                    # bin_chi, which uses the same expression.
                    both = sorted(cc(vals + r).items())

                    # Group consecutive values until a bin accumulates more
                    # than ~10% of the mass, then start a new bin at v + 1.
                    bins, span, sT = [], [], 0
                    for v, c in both:
                        span.append(v)
                        sT += c
                        if sT > v10:
                            bins.append((span[0], span[-1]))
                            span, sT = [v + 1], 0

                    print bins

        sys.exit()
Ejemplo n.º 30
0
    def select_labels(self, ID, MINSIZE=5, MAXGROUPS=20):
        """Convert the values of categorical variable ID into group indices.

        The most frequent option is always kept; after that, up to
        MAXGROUPS options with more than MINSIZE occurrences are kept.
        Every remaining option is pooled under the label 'UNAVAIL'.

        Returns (locations, labels): one group index per sample, and the
        list of retained labels those indices refer to.
        """
        observed = self.vals[ID]
        ranked = sorted(cc(observed).items(),
                        key=lambda pair: pair[1],
                        reverse=True)

        kept = []
        for rank, (option, count) in enumerate(ranked):
            if rank == 0 or (rank < MAXGROUPS and count > MINSIZE):
                kept.append(option)

        # Pool any option that was filtered out under a catch-all label.
        if any(option not in kept for option in set(observed)):
            kept.append('UNAVAIL')

        locations = []
        for value in observed:
            if value in kept: locations.append(kept.index(value))
            else: locations.append(kept.index('UNAVAIL'))

        return locations, kept