def make_scatter(congress_num):
    """Scatter-plot member cosine similarities for one congress, colored by k-means cluster.

    One axis is similarity to the most extreme member found, the other is
    similarity to the member least similar to them; cluster labels are flipped
    when needed so Democrats render blue and Republicans red.
    """
    plt.figure(1)
    sim = hs.make_similarity_array(congress_num)
    cls, means, steps = mlpy.kmeans(sim, k=2, plus=True)
    roster = hs.members(congress_num)
    # Index of the most extreme member, then of the member least similar to them.
    idx_a = roster.index(most_least.most_extreme(congress_num, 10)[0])
    idx_b = list(sim[idx_a]).index(min(sim[idx_a]))
    if roster[idx_a].split(' ')[1] == 'R':
        # Swap the two cluster labels so Democrats are blue and Republicans are red.
        for i, label in enumerate(cls):
            cls[i] = 1 if label == 0 else 0
        plt.scatter(sim[:, idx_a], sim[:, idx_b], c=cls, alpha=0.75)
        plt.xlabel("Conservatism (Cosine similarity to most conservative member, " + roster[idx_a].split(' ')[0] + ")")
        plt.ylabel("Liberalism (Cosine similarity to most liberal member, " + roster[idx_b].split(' ')[0] + ")")
    else:
        plt.scatter(sim[:, idx_b], sim[:, idx_a], c=cls, alpha=0.75)
        plt.ylabel("Liberalism (Cosine similarity to most liberal member, " + roster[idx_a].split(' ')[0] + ")")
        plt.xlabel("Conservatism (Cosine similarity to most conservative member, " + roster[idx_b].split(' ')[0] + ")")
    return
def RunKMeansMlpy():
    """Time a single mlpy k-means run on the benchmark dataset.

    Returns the elapsed time in seconds, or -1 on invalid options or failure.
    """
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    data = np.genfromtxt(self.dataset[0], delimiter=',')

    # Validate and extract the cluster count (required).
    if "clusters" not in options:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        return -1
    clusters = int(options.pop("clusters"))
    if clusters < 1:
        Log.Fatal("Invalid number of clusters requested! Must be greater than or " +
                  "equal to 1.")
        return -1

    # Optional seed; anything left over in options is an error.
    build_opts = {}
    if "seed" in options:
        build_opts["seed"] = int(options.pop("seed"))
    if options:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            # Create the K-Means object and perform K-Means clustering.
            mlpy.kmeans(data, clusters, **build_opts)
    except Exception:
        return -1
    return totalTimer.ElapsedTime()
def main(argv): if (len(argv)==2): # # # # # # # # # # # # # # # # # # # # # # # # # # # # THIS CODE IS FOR CLUSTERING - FEATURE EXTRACTION: # get tf-idf featuers: [files,words, tf, idf, tfidf, dFreq] = getTextFeatures(argv[1]) # print tf idf values: printMostFrequentWords(words, dFreq) if len(files)>0: SM = computeSimilarityMatrix(files,tfidf) nodeNames = [] for f in files: nodeNames.append(os.path.basename(f).replace("http:__arxiv.org_","")) drawGraphFromSM(SM, files, 'graph.png') cls, means, steps = mlpy.kmeans(SM, k=2, plus=False) fig = plt.figure(1) plt.plot(cls) plt.show() # # # # # # # # # # # # # # # # # # # # # # # # # # # # THIS CODE IS FOR FILE CLASSIFICATION: else: if (len(argv)==3): [dictionariesNames, dictionaries, dictionariesWeights] = loadDictionaries(argv[1]) [Labels, LabelsPs] = classifyFile(argv[2], dictionaries, dictionariesWeights, dictionariesNames, 4, 1) for i in range(len(Labels)): print Labels[i] + "\t\t" + str(LabelsPs[i])
def metric(self):
    """Run mlpy k-means on the benchmark data and report the wall-clock runtime."""
    timer = Timer()
    with timer:
        mlpy.kmeans(self.data[0], **self.build_opts)
    # Only total runtime is reported for this method.
    return {"runtime": timer.ElapsedTime()}
def signal_handler(signal, frame): global mtFeaturesMatrix global className mtFeaturesMatrix = numpy.array(mtFeaturesMatrix) cls, means, steps = mlpy.kmeans(mtFeaturesMatrix, k=2, plus=True) plt.plot(cls) plt.show() #numpy.save(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+className,mtFeaturesMatrix) print('You pressed Ctrl+C!') print mtFeaturesMatrix.shape sys.exit(0)
def test1(): np.random.seed(0) mean1, cov1, n1 = [1, 5], [[1,1],[1,2]], 200 # 200 points, mean=(1,5) x1 = np.random.multivariate_normal(mean1, cov1, n1) mean2, cov2, n2 = [2.5, 2.5], [[1,0],[0,1]], 300 # 300 points, mean=(2.5,2.5) x2 = np.random.multivariate_normal(mean2, cov2, n2) mean3, cov3, n3 = [5, 8], [[0.5,0],[0,0.5]], 200 # 200 points, mean=(5,8) x3 = np.random.multivariate_normal(mean3, cov3, n3) x = np.concatenate((x1, x2, x3), axis=0) # concatenate the samples cls, means, steps = mlpy.kmeans(x, k=3, plus=True) print means
def RunKMeansMlpy(q):
    """Time one mlpy k-means run; push the elapsed time (or -1 on error) onto q."""
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    data = np.genfromtxt(self.dataset, delimiter=',')

    # Options arrive as a CLI-style string; pull out -c (clusters) and -s (seed).
    clusters = re.search('-c (\d+)', options)
    seed = re.search("-s (\d+)", options)

    # Validate the requested cluster count.
    if not clusters:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        q.put(-1)
        return -1
    if int(clusters.group(1)) < 1:
        Log.Fatal("Invalid number of clusters requested! Must be greater than or " +
                  "equal to 1.")
        q.put(-1)
        return -1

    try:
        with totalTimer:
            # Seeded run when -s was supplied, default seeding otherwise.
            if seed:
                mlpy.kmeans(data, int(clusters.group(1)), seed=int(seed.group(1)))
            else:
                mlpy.kmeans(data, int(clusters.group(1)))
    except Exception:
        q.put(-1)
        return -1

    elapsed = totalTimer.ElapsedTime()
    q.put(elapsed)
    return elapsed
def clustering (depth,matrix, numCL): groupsOfUsers=[] cls, means, steps = mlpy.kmeans(matrix, k=numCL) print cls for i in range(numCL): groupsOfUsers.append(np.zeros((27,27))) for i in range(len(cls)): groupsOfUsers[cls[i]]+=depth[i] for i in range(numCL): np.save("groupOfUsers%02d.npy" % i , groupsOfUsers[i]) with open("groupOfUsers%02d.csv" % i, 'w') as csvfile: writer = csv.writer(csvfile) [writer.writerow(r) for r in groupsOfUsers[i].tolist()]
def RunKMeansMlpy(q):
    """Run mlpy k-means once on self.dataset[0]; report runtime via q, -1 on error."""
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    data = np.genfromtxt(self.dataset[0], delimiter=',')

    # Parse -c (clusters) and -s (seed) out of the option string.
    clusters = re.search('-c (\d+)', options)
    seed = re.search("-s (\d+)", options)

    if not clusters:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        q.put(-1)
        return -1
    k = int(clusters.group(1))
    if k < 1:
        Log.Fatal("Invalid number of clusters requested! Must be greater than or " +
                  "equal to 1.")
        q.put(-1)
        return -1

    try:
        with totalTimer:
            # Create the K-Means object and perform K-Means clustering.
            if seed:
                mlpy.kmeans(data, k, seed=int(seed.group(1)))
            else:
                mlpy.kmeans(data, k)
    except Exception:
        q.put(-1)
        return -1

    elapsed = totalTimer.ElapsedTime()
    q.put(elapsed)
    return elapsed
def main(): train_business_dict = {} train_input.get_business(config.train_business, train_business_dict) train_input.get_checkin(config.train_checkin, train_business_dict) test_business_dict = {} test_input.get_test_business(config.test_business, test_business_dict) test_input.get_test_checkin(config.test_checkin,test_business_dict) kmeas_input = [] for each in train_business_dict: l = train_business_dict[each].get_chink_info_vec() kmeas_input.append(l) ### 用mlpy 对checkin 的数据进行聚类 x = np.concatenate((kmeas_input), axis=0) cls, means, steps = mlpy.kmeans(x, k=100, plus=True) means_star = [ [0,0] for i in range(0,100)] ##计算每个train_business 属于哪个类 for each in train_business_dict: check_vec = train_business_dict[each].checkin_vec near_cluster = -1 min_dist = 100000000 for each_mean in range(0,100): dist = distance(check_vec, means[each_mean]) if dist < min_dist: near_cluster = each_mean min_dist = dist print near_cluster means_star[near_cluster][0] = means_star[near_cluster][0] + train_business_dict[each].stars means_star[near_cluster][1] = means_star[near_cluster][1] + 1 #然后计算每个business k_compute_test_review( config.test_review,train_business_dict,test_business_dict,means,means_star)
def RunKMeansMlpy(q):
    """Benchmark one mlpy k-means run; push the runtime (or -1 on failure) onto q."""
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    data = np.genfromtxt(self.dataset[0], delimiter=',')

    # The cluster count is mandatory and must be positive.
    if "clusters" not in options:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        q.put(-1)
        return -1
    clusters = int(options.pop("clusters"))
    if clusters < 1:
        Log.Fatal("Invalid number of clusters requested! Must be greater than or " +
                  "equal to 1.")
        q.put(-1)
        return -1

    # Optional seed; anything left in options afterwards is unrecognized.
    build_opts = {}
    if "seed" in options:
        build_opts["seed"] = int(options.pop("seed"))
    if options:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            # Create the K-Means object and perform K-Means clustering.
            mlpy.kmeans(data, clusters, **build_opts)
    except Exception:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
# gep(argv): end-to-end pipeline for a gene-essentiality prediction challenge.
# Visible stages: (1) read gene/cell-line annotation inputs from sys.argv[1..21]
# and unpickle precomputed objects from obj/*.pkl; (2) join cell-line metadata
# from sys.argv[12] with the annotation table; (3) build one-hot cell-line
# feature vectors (type/site/gender/histology); (4) assemble expression + CNV +
# annotation feature matrices (full3/full4 and their *_test counterparts);
# (5) k-means-cluster expression profiles (kmeans(..., k=5, plus=True));
# (6) variance-threshold feature pruning; (7) per-gene SelectPercentile
# (f_regression) feature selection and an SVR fit; (8) write predictions to
# "prediction<timestamp>.gct" and return 0.
# NOTE(review): this function was flattened onto a handful of physical lines,
# so its original indentation (and therefore the exact nesting of several
# loops and conditionals) cannot be reconstructed with confidence; it is left
# byte-identical below apart from these comments. It should be reformatted and
# decomposed into helpers (data loading, feature assembly, clustering,
# per-gene regression) before any behavioral change is attempted.
# NOTE(review): it ignores its `argv` parameter and reads sys.argv directly —
# presumably intentional, but verify against callers.
def gep(argv): #read gene and ccel info nexp_genes, t_exp = read_texp(sys.argv[1], sys.argv[2]) unip2gene = read_unip(sys.argv[3]) unip2gene_int = read_unip(sys.argv[3]) compl2 = {} ccel_ann = read_ccel(sys.argv[4]) mutations = count_mutations(sys.argv[5]) smutations = count_smutations(sys.argv[6]) cancer_genes = cancerGenes(sys.argv[7]) oncogenesBySite = cancerGenes_site(sys.argv[7]) tsg = [] compl = read_compl(sys.argv[8], unip2gene, compl2) compl2_c1 = {} compl_c1 = read_compl_c1(sys.argv[9], unip2gene, compl2_c1) interactions = read_int(sys.argv[10], unip2gene_int) ppi = read_int(sys.argv[11], unip2gene_int) tsg = read_tsg(sys.argv[13]) essG = essentialGenes(sys.argv[14]) essG = list(set(essG)) driverGenes = essentialGenes(sys.argv[15]) priority = essentialGenes(sys.argv[16]) tfs = readTF(sys.argv[17]) top_compl = essentialGenes(sys.argv[18]) mirnas = read_miRNA2(sys.argv[21]) miRNABySite = read_miRNA(sys.argv[21]) #load challenge data with open('obj/expr.pkl', 'rb') as handle: expr = pickle.load(handle) with open('obj/p2g.pkl', 'rb') as handle: p2g = pickle.load(handle) with open('obj/expr1.pkl', 'rb') as handle: expr1 = pickle.load(handle) with open('obj/expr2b.pkl', 'rb') as handle: expr2b = pickle.load(handle) with open('obj/expr2.pkl', 'rb') as handle: expr2 = pickle.load(handle) with open('obj/cnv.pkl', 'rb') as handle: cnv = pickle.load(handle) with open('obj/cnv_g.pkl', 'rb') as handle: cnv_g = pickle.load(handle) with open('obj/cnv2.pkl', 'rb') as handle: cnv2 = pickle.load(handle) with open('obj/genes.pkl', 'rb') as handle: genes = pickle.load(handle) with open('obj/headers.pkl', 'rb') as handle: headers = pickle.load(handle) with open('obj/headers_t.pkl', 'rb') as handle: headers_t = pickle.load(handle) with open('obj/es.pkl', 'rb') as handle: es = pickle.load(handle) with open('obj/essent2.pkl', 'rb') as handle: essent2 = pickle.load(handle) with open('obj/essent3.pkl', 'rb') as handle: essent3 = pickle.load(handle) with open('obj/geneInEssent.pkl', 
'rb') as handle: geneInEssent = pickle.load(handle) with open('obj/mut.pkl', 'rb') as handle: mut = pickle.load(handle) with open('obj/mut2.pkl', 'rb') as handle: mut2 = pickle.load(handle) with open('obj/mut3.pkl', 'rb') as handle: mut3 = pickle.load(handle) with open('obj/mut_test.pkl', 'rb') as handle: mut_test = pickle.load(handle) with open('obj/mut2_test.pkl', 'rb') as handle: mut2_test = pickle.load(handle) with open('obj/mut3_test.pkl', 'rb') as handle: mut3_test = pickle.load(handle) print mut3.keys() print mut3_test.keys() #join ccel data id_type = [] id_site = [] id_site2 = [] id_hist = [] id_hists = [] ccel_type = {} ccel_site = {} ccel_site2 = {} ccel_gender = {} ccel_hist = {} ccel_hists = {} nline = 0 tot_ccel = [] with open(sys.argv[12]) as f: for line in f.readlines(): line = line.replace("\n", "") line = line.replace("\r", "") ccel = line.split('\t') tot_ccel.append(ccel[0]) if not nline == 0: ccel_type[ccel[0]] = ccel[2] ccel_site[ccel[0]] = ccel[3] found = False if ccel[0] in ccel_ann: found = True ccel_gender[ccel[0]] = ccel_ann[ccel[0]][0] ccel_site2[ccel[0]] = ccel_ann[ccel[0]][1] ccel_hist[ccel[0]] = ccel_ann[ccel[0]][2] ccel_hists[ccel[0]] = ccel_ann[ccel[0]][3] if ccel[1] in ccel_ann and not found: found = True ccel_gender[ccel[0]] = ccel_ann[ccel[1]][0] ccel_site2[ccel[0]] = ccel_ann[ccel[1]][1] ccel_hist[ccel[0]] = ccel_ann[ccel[1]][2] ccel_hists[ccel[0]] = ccel_ann[ccel[1]][3] if not found: print "MISSING!" 
+ ccel[0] ccel_gender[ccel[0]] = '' ccel_site2[ccel[0]] = '' ccel_hist[ccel[0]] = '' ccel_hists[ccel[0]] = '' nline += 1 for s in ccel_site: if not (ccel_site[s] in id_site): id_site.append(ccel_site[s]) for s in ccel_type: if not (ccel_type[s] in id_type): id_type.append(ccel_type[s]) for s in ccel_site2: if not (ccel_site2[s] in id_site2): id_site2.append(ccel_site2[s]) for s in ccel_hist: if not (ccel_hist[s] in id_hist): id_hist.append(ccel_hist[s]) for s in ccel_hists: if not (ccel_hists[s] in id_hists): id_hists.append(ccel_hists[s]) mutatedGenes, mutationCCEL, mutationSITE, mutationSITE2, mutationHIST, mutationHISTS = mutations_ccel( sys.argv[5], headers) #vectorize ccel data c_features = {} for c in tot_ccel: if c != "Name" and c != "Description": c_features[c] = [] #row.append(smutations[c]) tissID = 0 for id1, t in enumerate(id_type): if t == str(ccel_type[c]): c_features[c].append(1) #tissID=id1 else: c_features[c].append(0) #row.append(tissID) # siteID=0 for id1, s in enumerate(id_site): if s == str(ccel_site[c]): c_features[c].append(1) #siteID=id1 else: c_features[c].append(0) #row.append(siteID) gender = 0 if ccel_gender[c] == 'M': gender = 1 if ccel_gender[c] == 'F': gender = 2 c_features[c].append(gender) #siteID=0 for id1, s in enumerate(id_site2): if s == str(ccel_site2[c]): c_features[c].append(1) else: c_features[c].append(0) #siteID=id1 #row.append(siteID) #siteID=0 for id1, s in enumerate(id_hist): if s == str(ccel_hist[c]): c_features[c].append(1) else: c_features[c].append(0) #siteID=id1 #row.append(siteID) #siteID=0 for id1, s in enumerate(id_hists): if s == str(ccel_hists[c]): c_features[c].append(1) else: c_features[c].append(0) #siteID=id1 #put data in the right format for lerner intr = 0 tot_sp1 = 0 tot_sp2 = 0 s1_prediction = {} full = [] full_exp = [] full_cnv = [] full_ccel = [] full_s = [] full2 = [] full3 = [] full4 = [] full3_test = [] full4_test = [] full_rank = [] minmax_fe = [] minmax_fc = [] onco_features = [] ccel_features = 
[] mut_features = [] expression_features = [] for c in headers: if c != "Name" and c != "Description": row = [] row2 = [] drow = {} drow2 = {} #row=row+c_features[c] for ex in genes: #row.append((expr1[c+ex]-exp_avg)/exp_std) row.append(expr1[c + ex]) drow[ex] = expr1[c + ex] for ex in cnv_g: row2.append(float(cnv[c + ex])) drow2[ex] = cnv[c + ex] ''' sorted_x = sorted(drow.iteritems(), key=operator.itemgetter(1)) etop=sorted_x[0:10] ebot=sorted_x[-10:] for e in etop: minmax_fe.append(e[0]) for e in ebot: minmax_fe.append(e[0]) sorted_x = sorted(drow2.iteritems(), key=operator.itemgetter(1)) etop=sorted_x[0:10] ebot=sorted_x[-10:] for e in etop: minmax_fc.append(e[0]) for e in ebot: minmax_fc.append(e[0]) ''' full_exp.append(row) full_cnv.append(row2) full_ccel.append(c_features[c]) mutRow = [] for g in mutatedGenes: if c in mutationCCEL: if g in mutationCCEL[c]: mutRow.append(1) else: mutRow.append(0) else: mutRow.append(0) oncoRow = [] print c for g in cancer_genes: if ccel_site[c] in oncogenesBySite: if g in oncogenesBySite[ccel_site[c]]: oncoRow.append(1) else: oncoRow.append(0) elif ccel_site2[c] in oncogenesBySite: if g in oncogenesBySite[ccel_site2[c]]: oncoRow.append(1) else: oncoRow.append(0) else: oncoRow.append(0) nexp = [] for g in nexp_genes: if ccel_site[c] in t_exp: if g in t_exp[ccel_site[c]]: nexp.append(1) else: nexp.append(0) elif ccel_site2[c] in t_exp: if g in t_exp[ccel_site2[c]]: nexp.append(1) else: nexp.append(0) else: nexp.append(0) miRNA = [] print c for g in mirnas: if ccel_site[c] in miRNABySite: if g in miRNABySite[ccel_site[c]]: print "entro" miRNA.append(1) else: miRNA.append(0) elif ccel_site2[c] in miRNABySite: if g in miRNABySite[ccel_site2[c]]: miRNA.append(1) print "entro2" else: miRNA.append(0) else: miRNA.append(0) onco_features.append(oncoRow) ccel_features.append(c_features[c]) mut_features.append(mutRow) expression_features.append(nexp) full3.append(row + row2 + c_features[c] + oncoRow + nexp) full4.append(row + row2 + 
c_features[c] + oncoRow + nexp + mut3[c]) full.append(row + row2 + c_features[c]) #print len(row) #full_exp=preprocessing.binarize(np.array(full_exp),threshold=np.mean(full_exp)).tolist() #full_cnv=preprocessing.binarize(np.array(full_cnv),threshold=np.mean(full_cnv)).tolist() full_s = [] full_s2 = [] for c in headers: if c != "Name" and c != "Description": row = [] for ex in genes: row.append(expr1[c + ex]) full_s.append(row) for c in headers_t: if c != "Name" and c != "Description": row = [] for ex in genes: row.append(expr2[c + ex]) full_s2.append(row) print len(full_s) print len(full_s2) #full_s=np.concatenate((np.array(full_s),np.array(full_s2))) full_s = np.array(full_s) print len(full_s[0]) cls, means, steps = kmeans(full_s, k=5, plus=True) clusters = {} for c in range(len(cls)): clusters[c] = [] for i, c in enumerate(cls): clusters[c].append(i) full = means.T full = full.tolist() for c in headers_t: if c != "Name" and c != "Description": row2 = [] row = [] for ex in genes: row.append(expr2[c + ex]) for ex in cnv_g: row2.append(float(cnv2[c + ex])) mutRow = [] for g in mutatedGenes: if c in mutationCCEL: if g in mutationCCEL[c]: mutRow.append(1) else: mutRow.append(0) else: mutRow.append(0) oncoRow = [] print c for g in cancer_genes: if ccel_site[c] in oncogenesBySite: if g in oncogenesBySite[ccel_site[c]]: oncoRow.append(1) else: oncoRow.append(0) elif ccel_site2[c] in oncogenesBySite: if g in oncogenesBySite[ccel_site2[c]]: oncoRow.append(1) else: oncoRow.append(0) else: oncoRow.append(0) nexp = [] for g in nexp_genes: if ccel_site[c] in t_exp: if g in t_exp[ccel_site[c]]: nexp.append(1) else: nexp.append(0) elif ccel_site2[c] in t_exp: if g in t_exp[ccel_site2[c]]: nexp.append(1) else: nexp.append(0) else: nexp.append(0) miRNA = [] print c for g in mirnas: if ccel_site[c] in miRNABySite: if g in miRNABySite[ccel_site[c]]: print "entro" miRNA.append(1) else: miRNA.append(0) elif ccel_site2[c] in miRNABySite: if g in miRNABySite[ccel_site2[c]]: 
miRNA.append(1) print "entro2" else: miRNA.append(0) else: miRNA.append(0) full3_test.append(row + row2 + c_features[c] + oncoRow + nexp) full4_test.append(row + row2 + c_features[c] + oncoRow + nexp + mut3_test[c]) row2 = row + row2 + c_features[c] full2.append(row2) #xt=np.array(full) #test=np.array(full2) sel = VarianceThreshold(threshold=0) full3 = np.array(full3) full3_test = np.array(full3_test) tmp = np.concatenate((full3, full3_test), axis=0) vs = sel.fit(tmp) full3 = vs.transform(full3) full3 = full3.tolist() full3_test = vs.transform(full3_test) full3_test = full3_test.tolist() full4 = np.array(full4) full4_test = np.array(full4_test) tmp = np.concatenate((full4, full4_test), axis=0) vs = sel.fit(tmp) full4 = vs.transform(full4) full4 = full4.tolist() full4_test = vs.transform(full4_test) full4_test = full4_test.tolist() inExp = 0 ninExp = 0 c_feat2 = [] for c in headers: if c != "Name" and c != "Description": row = list(c_features[c]) c_feat2.append(row) #feature selection strategies features = {} alias = np.array(genes + cnv_g) features['oncogenes'] = [] features['mutations'] = [] features['tsg'] = [] features['essential'] = [] features['control'] = [] features['driver'] = [] #features['top_compl']=[] features['minmax'] = [] features['tf'] = [] features_test = {} features_test['oncogenes'] = [] features_test['mutations'] = [] features_test['tsg'] = [] features_test['essential'] = [] features_test['control'] = [] features_test['driver'] = [] ''' for tf in tfs: tmp=[] for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for p in tfs[tf]: if (str(c)+str(p)) in expr: exp_list.append(float(expr[str(c)+str(p)])) if (str(c)+str(p)) in cnv: mlpy.kmeans(x, k=3, plus=True) exp_list.append(float(cnv[str(c)+str(p)])) if len(exp_list)>2: tmp.append(np.array(exp_list)) if len(tmp)>1: features[tf]=tmp print str(tf)+": "+str(len(tmp[0])) for cc in top_compl: tmp=[] for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for p in compl[cc]: if (str(c)+str(p)) in 
expr: exp_list.append(float(expr[str(c)+str(p)])) if (str(c)+str(p)) in cnv: exp_list.append(float(cnv[str(c)+str(p)])) if len(exp_list)>2: tmp.append(np.array(exp_list)) if len(tmp)>1: features[cc]=tmp for cc in compl: tmp=[] for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for p in compl[cc]: if (str(c)+str(p)) in expr: exp_list.append(float(expr[str(c)+str(p)])) if (str(c)+str(p)) in cnv: exp_list.append(float(cnv[str(c)+str(p)])) if len(exp_list)>2: tmp.append(np.array(exp_list)) if len(tmp)>1: features[cc]=tmp if g in ppi: for p in ppi[g]: if ((str(c)+str(p)) in expr) and (not (p==g)): exp_list.append(float(expr[str(c)+str(p)])) if (str(c)+str(g) in expr): exp_list.append(float(expr[str(c)+str(g)])) ''' #print ccel_hist.values() #cancer_genes = cancerGenes2(sys.argv[7], ccel_type.values(), ccel_hist.values(), ccel_site.values(), ccel_site2.values()) #print len(cancer_genes) combined_probes = [] combined_cnvs = [] oncoprobes = [] for i, c in enumerate(headers): if (i - 2) >= 0: exp_list = [] cancer_list = [] mutation_list = [] tsg_list = [] control_list = [] ess_list = [] driver_list = [] tf_list = [] ''' for f in hist2: if f in genes: top_list.append(float(expr1[str(c)+str(f)])) if f in cnv_g: top_list.append(float(cnv[str(c)+str(f)])) for cc in top_compl: for p in compl[int(cc)]: if (str(c)+str(p)) in expr: exp_list.append(float(expr[str(c)+str(p)])) if (str(c)+str(p)) in cnv: exp_list.append(float(cnv[str(c)+str(p)])) features['top_compl'].append(exp_list) ''' for ex in genes: #if p2g[ex] in cancer_genes: # exp_list.append(float(expr[str(c)+str(p2g[ex])])) if p2g[ex] in cancer_genes: cancer_list.append(float(expr[str(c) + str(p2g[ex])])) #combined_probes.append(ex) if p2g[ex] in tfs: tf_list.append(float(expr[str(c) + str(p2g[ex])])) #combined_probes.append(ex) if p2g[ex] in mutations: mutation_list.append(float(expr[str(c) + str(p2g[ex])])) #combined_probes.append(ex) if p2g[ex] in tsg: tsg_list.append(float(expr[str(c) + str(p2g[ex])])) 
#combined_probes.append(ex) if p2g[ex] in essG: ess_list.append(float(expr[str(c) + str(p2g[ex])])) #combined_probes.append(ex) if p2g[ex] in driverGenes: driver_list.append(float(expr[str(c) + str(p2g[ex])])) #combined_probes.append(ex) for ex in cnv_g: if ex in cancer_genes: cancer_list.append(float(cnv[str(c) + str(ex)])) #combined_cnvs.append(ex) if ex in tfs: tf_list.append(float(cnv[str(c) + str(ex)])) #combined_cnvs.append(ex) if ex in mutations: mutation_list.append(float(cnv[str(c) + str(ex)])) #combined_cnvs.append(ex) if ex in tsg: tsg_list.append(float(cnv[str(c) + str(ex)])) #combined_cnvs.append(ex) if ex in essG: ess_list.append(float(cnv[str(c) + str(ex)])) #combined_cnvs.append(ex) if ex in driverGenes: driver_list.append(float(cnv[str(c) + str(ex)])) #combined_cnvs.append(ex) features['oncogenes'].append(cancer_list + c_features[c]) features['mutations'].append(mutation_list) features['tsg'].append(tsg_list) features['essential'].append(ess_list) features['driver'].append(driver_list) features['tf'].append(tf_list) ''' redundants=[] print "redundant" for i1,f1 in enumerate(features['oncogenes'][0]): row=[] print i1 for i2,f2 in enumerate(features['oncogenes'][0]): if i1>i2: row.append(pearsonr(np.array(features['oncogenes'])[:,i1],np.array(features['oncogenes'])[:,i2])[0]) else: row.append(0) redundants.append(row) features['oncogenes2']=[] noRed=[] print "removing..." 
for i in range(len(redundants),0): if (min(redundants[i])<0.8): noRed.append(i) tmp=np.array() features['oncogenes2']=np.array(features['oncogenes'])[:,noRed] features['oncogenes2'].tolist() print len(features['oncogenes'][0]) print len(features['oncogenes2'][0]) #for i,f in enumerate(features['oncogenes'][0]): #if i in noRed: #np.concatenate(tmp,features['oncogenes print len(features['oncogenes'][0]) cls, means, steps = kmeans(np.array(features['oncogenes']).T, k=3000, plus=True) #print len(cls) #print len(means) #print len(means[0]) features['oncogenes']=means.T features['oncogenes']=features['oncogenes'].tolist() print len(features['oncogenes'][0]) #for i,c in enumerate(headers): # if (i-2)>=0: # features['oncogenes'][i-2]=features['oncogenes'][i-2]+c_features[c] ''' print "features test" ''' combined_probes_test=[] combined_cnvs_test=[] for i,c in enumerate(headers_t): if (i-2)>=0: exp_list=[] cancer_list=[] mutation_list=[] tsg_list=[] control_list=[] ess_list=[] driver_list=[] for ex in genes: #if p2g[ex] in cancer_genes: # exp_list.append(float(expr[str(c)+str(p2g[ex])])) if p2g[ex] in cancer_genes: cancer_list.append(float(expr2b[str(c)+str(p2g[ex])])) combined_probes.append(ex) if p2g[ex] in mutations: mutation_list.append(float(expr2b[str(c)+str(p2g[ex])])) combined_probes_test.append(ex) if p2g[ex] in tsg: tsg_list.append(float(expr2b[str(c)+str(p2g[ex])])) combined_probes_test.append(ex) if p2g[ex] in essG: ess_list.append(float(expr2b[str(c)+str(p2g[ex])])) combined_probes_test.append(ex) if p2g[ex] in driverGenes: driver_list.append(float(expr2b[str(c)+str(p2g[ex])])) combined_probes_test.append(ex) for ex in cnv_g: if ex in cancer_genes: cancer_list.append(float(cnv2[str(c)+str(ex)])) combined_cnvs_test.append(ex) if ex in mutations: mutation_list.append(float(cnv2[str(c)+str(ex)])) combined_cnvs_test.append(ex) if ex in tsg: tsg_list.append(float(cnv2[str(c)+str(ex)])) combined_cnvs_test.append(ex) if ex in essG: 
ess_list.append(float(cnv2[str(c)+str(ex)])) combined_cnvs_test.append(ex) if ex in driverGenes: driver_list.append(float(cnv2[str(c)+str(ex)])) combined_cnvs_test.append(ex) features_test['oncogenes'].append(cancer_list) features_test['mutations'].append(mutation_list) features_test['tsg'].append(tsg_list) features_test['essential'].append(ess_list) features_test['driver'].append(driver_list) ''' print "f test" ''' for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for ex in genes: #if p2g[ex] in cancer_genes: # exp_list.append(float(expr[str(c)+str(p2g[ex])])) if p2g[ex] in mutations: exp_list.append(float(expr[str(c)+str(p2g[ex])])) combined_probes.append(ex) for ex in cnv_g: if ex in mutations: exp_list.append(float(cnv[str(c)+str(ex)])) combined_cnvs.append(ex) features['mutations'].append(exp_list) for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for ex in genes: #if p2g[ex] in cancer_genes: # exp_list.append(float(expr[str(c)+str(p2g[ex])])) if p2g[ex] in tsg: exp_list.append(float(expr[str(c)+str(p2g[ex])])) combined_probes.append(ex) for ex in cnv_g: if ex in tsg: exp_list.append(float(cnv[str(c)+str(ex)])) combined_cnvs.append(ex) features['tsg'].append(exp_list) for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for ex in genes: #if p2g[ex] in cancer_genes: # exp_list.append(float(expr[str(c)+str(p2g[ex])])) if p2g[ex] in essG: exp_list.append(float(expr[str(c)+str(p2g[ex])])) combined_probes.append(ex) for ex in cnv_g: if ex in essG: exp_list.append(float(cnv[str(c)+str(ex)])) combined_cnvs.append(ex) features['essential'].append(exp_list) #combined features combined_probes=list(set(combined_probes)) combined_cnvs=list(set(combined_cnvs)) features['combined']=[] for i,c in enumerate(headers): if (i-2)>=0: exp_list=[] for ex in combined_probes: exp_list.append(float(expr1[str(c)+str(ex)])) for ex in combined_cnvs: exp_list.append(float(cnv[str(c)+str(ex)])) features['combined'].append(exp_list) 
combined_probes_test=list(set(combined_probes_test)) combined_cnvs_test=list(set(combined_cnvs_test)) features_test['combined']=[] for i,c in enumerate(headers_t): if (i-2)>=0: exp_list=[] for ex in combined_probes: exp_list.append(float(expr2[str(c)+str(ex)])) for ex in combined_cnvs: exp_list.append(float(cnv2[str(c)+str(ex)])) features_test['combined'].append(exp_list) ''' #print gct header scores = {} for f in features: scores[f] = 0 scoresp = {} for f in features: scoresp[f] = 0 print "features ready" #header for the final submission out_file_f = open("prediction" + str(time.time()) + ".gct", "w") out_file_f.write("#1.2\n") out_file_f.write( str(len(geneInEssent)) + "\t" + str(len(headers_t) - 2) + "\n") for c in headers_t: if c != "Name": out_file_f.write("\t") out_file_f.write(str(c)) out_file_f.write("\n") best = 0 control = 0 avg = 0 rndGenes = random.sample(geneInEssent, 1000) #for g in rndGenes: for g in geneInEssent: #for g in priority: intr += 1 #s1_prediction[g]={} yt = np.array(essent2[g]) current_score = {} current_scorep = {} xt = np.array(full3) test = np.array(full3_test) selector = SelectPercentile(f_regression, percentile=18).fit(xt, yt) xt2 = selector.transform(xt) test = selector.transform(test) eps2 = 3 * np.std(yt) * math.sqrt(math.log(len(yt)) / len(yt)) cc = max(abs(np.mean(yt) + np.std(yt)), abs(np.mean(yt) - np.std(yt))) knn2 = svm.SVR(C=cc, epsilon=eps2) res = knn2.fit(xt2, yt).predict(test) out_file_f.write(g + "\t" + g) for p in res: out_file_f.write("\t" + str(p)) out_file_f.write("\n") out_file_f.flush() #best+=max(current_score) out_file_f.close() return 0
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
import mlpy
from sklearn.cluster import KMeans

# Load the iris data set and keep the pieces we plot.
datos = load_iris()
dat = datos.data
caract_names = datos.feature_names
tar = datos.target

# Run k-means with three clusters.
cls, means, steps = mlpy.kmeans(dat, k=3, plus=True)
# steps: reports how many iterations the algorithm took to finish.

# Build the corresponding plots: samples colored by cluster assignment.
plt.subplot(2, 1, 1)
fig = plt.figure(1)
fig.suptitle("Ejemplo de k-medias", fontsize=15)
plot1 = plt.scatter(dat[:, 0], dat[:, 1], c=cls, alpha=0.75)

# Overlay the cluster means on the plot.
plot2 = plt.scatter(means[:, 0], means[:, 1], c=[1, 2, 3], s=128, marker='d')
#plt.show()
import numpy as np import mlpy import json import sys NR_CLUSTERS = 5 if len(sys.argv) > 1: try: NR_CLUSTERS = int(sys.argv[1]) except: pass emails = np.genfromtxt('email-features.csv', delimiter=',') cls, means, steps = mlpy.kmeans(emails, k=NR_CLUSTERS, plus=True) print steps, "steps" print len(means), "clusters" emails_file = open("emails_parsed_0.json") emails = json.load(emails_file) f = open("classifications.txt", "w") for i in range(len(emails)): f.write( "========== EMAIL ==========\n" ) f.write("==== CLASS " + str(cls[i]) + "\n") f.write("==== subject: " + emails[i]['subject'] + '\n') f.write("==== email_text: \n" + (emails[i]['email_text']).encode('utf-8') + '\n') f.close()
# Script fragment: list a directory, collect [name, size-KB, mtime, mode] for
# each non-empty entry into the externally defined `dinfo`, k-means-cluster
# the files into 10 groups by log(size), then print them grouped by cluster
# and sorted by size.
# NOTE(review): `dir`, `dinfo`, `log` and `kmeans` are defined earlier in the
# file (outside this view); `dir` and `file` shadow builtins — consider
# renaming. The original indentation was lost in this copy, so the exact
# grouping of the final print statements could not be safely reconstructed;
# the code is left byte-identical.
dirlist = os.listdir(dir) for file in dirlist: mypath = dir + '/' + file info = os.stat(mypath) # don't deal with directories yet if info.st_size > 0: finfo = [file,float(info.st_size)/1024,info.st_mtime,info.st_mode] dinfo += [finfo] # cluster by size infos = [[log(x[1])] for x in dinfo] cls,means,steps = kmeans(infos,k=10,plus=True) ctmp = -1 data = zip([x[0] for x in dinfo],cls,infos) for f,c,s in sorted(data,key=lambda tempkey: tempkey[2]): if c != ctmp: print '###############################' print c,'\t',round(s[0],1),'\t\t',f ctmp = c
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    Estimate the optimal number of speakers in a recording: extract mid-term
    audio features, optionally reduce them with LDA, run k-means for each
    candidate speaker count and pick the count with the best average silhouette.

    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin (opt)      short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT (opt)       0 for not plotting the results 1 for plottingy

    RETURNS:
        the candidate speaker count with the highest average silhouette
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    # NOTE(review): integer division if len(x) and Fs are both ints — confirm
    # Duration is intended in whole seconds.
    Duration = len(x) / Fs

    # Load the two pre-trained KNN speaker models (all-speakers, female/male).
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(
        "data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(
        "data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
                                                                  round(Fs * stWin * 0.5))

    # Augment every mid-term feature vector with the two classifiers' class
    # probabilities; the +0.0001 offset keeps the appended entries non-zero.
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES (alternative hand-picked index sets kept for experiments):
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];  # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];  # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 0C
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
                       52, 53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];  # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 1C
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];  # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];  # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 2C
    # iFeaturesSelect = range(100);  # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: drop windows whose summed distance to all other windows
    # exceeds 1.2x the mean of those sums
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1,:])
    # EnergyMean = numpy.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm  # keep the pre-filter matrix
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        # Per short-term feature: mean and std over each mid-term window.
        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio

        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        # Same classifier-probability augmentation as above, on the dense features.
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        # Pseudo-labels for supervised LDA: one label per LDAstep-long segment.
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    # Candidate speaker counts: fixed if known, else sweep 2..9.
    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering
        # YDist = distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls
        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []; silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                # near-empty cluster: contributes zero to the silhouette
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILLOUETTE

    # silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters
    return nSpeakersFinal
# NOTE(review): second definition of speakerDiarization in this file; it
# shadows the earlier one.  This version runs the FULL diarization pipeline
# (HMM smoothing, median filtering, optional ground-truth evaluation/plotting).
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
    '''
    Full speaker-diarization pipeline: feature extraction, optional LDA,
    k-means over candidate speaker counts, silhouette-based model selection,
    HMM + median-filter smoothing of the labels, and optional evaluation
    against a .segments ground-truth file.

    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin (opt)      short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT (opt)       0 for not plotting the results 1 for plottingy
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x);
    # NOTE(review): integer division if len(x) and Fs are ints — confirm units.
    Duration = len(x) / Fs

    # Load the two pre-trained KNN speaker models (all-speakers, female/male).
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

    # Augment each mid-term feature vector with both classifiers' class
    # probabilities; +0.0001 keeps the appended entries strictly positive.
    MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:,i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:,i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
        MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;

    MidTermFeatures = MidTermFeatures2    # TODO
    # SELECT FEATURES (alternative hand-picked index sets kept for experiments):
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];  # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];  # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 0C
    iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53];  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];  # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 1C
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];  # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];  # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 2C
    #iFeaturesSelect = range(100);  # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: drop windows whose summed distance to all other windows
    # exceeds 1.2x the mean of those sums
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm  # keep pre-filter matrix for smoothing later
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin));
        mtStepRatio = int(round(stWin / stWin));
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2;
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        # Per short-term feature: mean and std over each mid-term window.
        for i in range(numOfFeatures):        # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos<N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)

        # Same classifier-probability augmentation as above, on the dense features.
        mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:,i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:,i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        # Pseudo-labels for supervised LDA: one label per LDAstep-long segment.
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*stWin/LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    # Candidate speaker counts: fixed if known, else sweep 2..9.
    if numOfSpeakers<=0:
        sRange = range(2,10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []; silAll = []; centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)        # perform k-means clustering
        #YDist = distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        #print distance.squareform(YDist).shape
        #hc = mlpy.HCluster()
        #hc.linkage(YDist)
        #cls = hc.cut(14.5)
        #print cls
        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []; silB = []
        for c in range(iSpeakers):        # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                # near-empty cluster: contributes zero to the silhouette
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]        # get subset of feature vectors
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)        # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt)*clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):        # compute distances from samples of other clusters
                    if c2!=c:
                        clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))        # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA);
        silB = numpy.array(silB);
        sil = []
        for c in range(iSpeakers):        # for each cluster (speaker)
            sil.append( ( silB[c] - silA[c]) / (max(silB[c], silA[c])+0.00001)  )        # compute silhouette
        silAll.append(numpy.mean(sil))        # keep the AVERAGE SILLOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)        # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]        # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i-iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    # NOTE(review): sklearn.hmm is the pre-0.17 scikit-learn HMM API — this
    # code presumably targets an old scikit-learn; verify the pinned version.
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)            # hmm training
        hmm.means_ = means; hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]        # final sillouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments');            # open for annotated file
    if os.path.isfile(gtFile):        # if groundturh exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)        # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)        # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers>0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers<=0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters");
            plt.ylabel("average clustering's sillouette");
        plt.show()
# Synthetic three-cluster k-means demo: draw three Gaussian blobs, cluster
# them with mlpy's k-means++ and visualize the points and the cluster means.
import numpy as np
import matplotlib.pyplot as plt
import mlpy

np.random.seed(0)

# (mean, covariance, sample count) per blob.  The sampling order below is
# fixed so results are reproducible under the seed above.
blob_specs = [
    ([1, 5], [[1, 1], [1, 2]], 200),        # 200 points around (1, 5)
    ([2.5, 2.5], [[1, 0], [0, 1]], 300),    # 300 points around (2.5, 2.5)
    ([5, 8], [[0.5, 0], [0, 0.5]], 200),    # 200 points around (5, 8)
]
samples = np.concatenate(
    [np.random.multivariate_normal(m, c, n) for m, c, n in blob_specs], axis=0)

# k-means with k-means++ seeding; `steps` is the iteration count.
labels, centroids, steps = mlpy.kmeans(samples, k=3, plus=True)

# Scatter the samples colored by cluster and overlay the means as diamonds.
fig = plt.figure(1)
sample_plot = plt.scatter(samples[:, 0], samples[:, 1], c=labels, alpha=0.75)
centroid_plot = plt.scatter(centroids[:, 0], centroids[:, 1], c=np.unique(labels), s=128, marker='d')
plt.show()