Example #1
def make_scatter(congress_num):
    fig = plt.figure(1)
    arr = hs.make_similarity_array(congress_num)
    cls, means, steps = mlpy.kmeans(arr, k=2, plus=True)
    members = hs.members(congress_num)
    extreme_index1 = members.index(most_least.most_extreme(congress_num, 10)[0])
    extreme_index2 = list(arr[extreme_index1]).index(min(arr[extreme_index1]))
    if members[extreme_index1].split(' ')[1] == 'R':
        #change color so Democrats are blue and Republicans are red
        for i in range(len(cls)):
            if cls[i] == 0:
                cls[i] = 1
            else:
                cls[i] = 0
        plot = plt.scatter(arr[:, extreme_index1], arr[:, extreme_index2], c=cls, alpha=0.75)
        plt.xlabel("Conservatism (Cosine similarity to most conservative member, "
                   + members[extreme_index1].split(' ')[0] + ")")
        plt.ylabel("Liberalism (Cosine similarity to most liberal member, "
                   + members[extreme_index2].split(' ')[0] + ")")
    else:
        plot = plt.scatter(arr[:, extreme_index2], arr[:, extreme_index1], c=cls, alpha=0.75)
        plt.ylabel("Liberalism (Cosine similarity to most liberal member, "
                   + members[extreme_index1].split(' ')[0] + ")")
        plt.xlabel("Conservatism (Cosine similarity to most conservative member, "
                   + members[extreme_index2].split(' ')[0] + ")")
    return
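Note: the relabelling loop above (swapping cluster ids 0 and 1 so Democrats map to blue and Republicans to red) can be collapsed into one vectorized step, assuming mlpy.kmeans returns its labels as a NumPy integer array of 0s and 1s. A sketch, not part of the original function:

import numpy as np

cls = np.array([0, 1, 1, 0])  # stand-in for the labels returned by mlpy.kmeans
cls = 1 - cls                 # swap cluster ids 0 <-> 1 in a single step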
Example #2
    def RunKMeansMlpy():
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      data = np.genfromtxt(self.dataset[0], delimiter=',')

      # Gather all parameters.
      if "clusters" in options:
        clusters = int(options.pop("clusters"))

        if clusters < 1:
          Log.Fatal("Invalid number of clusters requested! Must be greater than or "
              + "equal to 1.")
          return -1
      else:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        return -1

      build_opts = {}
      if "seed" in options:
        build_opts["seed"] = int(options.pop("seed"))

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      try:
        with totalTimer:
          # Create the K-Means object and perform K-Means clustering.
          kmeans = mlpy.kmeans(data, clusters, **build_opts)
      except Exception as e:
        return -1

      return totalTimer.ElapsedTime()
Example #3
def main(argv):
	if (len(argv)==2):
		# # # # # # # # # # # # # # # # # # # # # # # # # # #
		# THIS CODE IS FOR CLUSTERING - FEATURE EXTRACTION:
		# get tf-idf features:
		[files,words, tf, idf, tfidf, dFreq]  = getTextFeatures(argv[1])
		# print tf idf values:
		printMostFrequentWords(words, dFreq)
		if len(files)>0:
			SM = computeSimilarityMatrix(files,tfidf)
			nodeNames = []
			for f in files:
				nodeNames.append(os.path.basename(f).replace("http:__arxiv.org_",""))
			drawGraphFromSM(SM, files, 'graph.png')
			cls, means, steps = mlpy.kmeans(SM, k=2, plus=False)
			fig = plt.figure(1)
			plt.plot(cls)
			plt.show()
			


		# # # # # # # # # # # # # # # # # # # # # # # # # # #
		# THIS CODE IS FOR FILE CLASSIFICATION:
	else:
		if (len(argv)==3):			
			[dictionariesNames, dictionaries, dictionariesWeights] = loadDictionaries(argv[1])
			[Labels, LabelsPs] = classifyFile(argv[2], dictionaries, dictionariesWeights, dictionariesNames, 4, 1)
			for i in range(len(Labels)):
				print Labels[i] + "\t\t" + str(LabelsPs[i])
Example #4
    def metric(self):
        totalTimer = Timer()
        with totalTimer:
            model = mlpy.kmeans(self.data[0], **self.build_opts)

        metric = {}
        metric["runtime"] = totalTimer.ElapsedTime()
        return metric
Example #5
def signal_handler(signal, frame):
    global mtFeaturesMatrix
    global className
    mtFeaturesMatrix = numpy.array(mtFeaturesMatrix)    
    cls, means, steps = mlpy.kmeans(mtFeaturesMatrix, k=2, plus=True)
    plt.plot(cls)
    plt.show()
    #numpy.save(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+className,mtFeaturesMatrix)
    print('You pressed Ctrl+C!')
    print mtFeaturesMatrix.shape
    sys.exit(0)
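The handler above is presumably installed for Ctrl+C; a minimal registration sketch using only the standard library (the recording loop that fills mtFeaturesMatrix is assumed and not shown):

import signal

signal.signal(signal.SIGINT, signal_handler)  # run the clustering/plot on Ctrl+C
signal.pause()                                # wait for the interrupt (POSIX only)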
Example #6
def test1():
    np.random.seed(0)
    mean1, cov1, n1 = [1, 5], [[1,1],[1,2]], 200 # 200 points, mean=(1,5)
    x1 = np.random.multivariate_normal(mean1, cov1, n1)
    mean2, cov2, n2 = [2.5, 2.5], [[1,0],[0,1]], 300 # 300 points, mean=(2.5,2.5)
    x2 = np.random.multivariate_normal(mean2, cov2, n2)
    mean3, cov3, n3 = [5, 8], [[0.5,0],[0,0.5]], 200 # 200 points, mean=(5,8)
    x3 = np.random.multivariate_normal(mean3, cov3, n3)
    x = np.concatenate((x1, x2, x3), axis=0) # concatenate the samples
    cls, means, steps = mlpy.kmeans(x, k=3, plus=True)
    print means
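A follow-up sketch of the synthetic test above, assuming matplotlib is available: colour each sample by its cluster label and overlay the estimated means.

import numpy as np
import matplotlib.pyplot as plt
import mlpy

np.random.seed(0)
x = np.concatenate((
    np.random.multivariate_normal([1, 5], [[1, 1], [1, 2]], 200),
    np.random.multivariate_normal([2.5, 2.5], [[1, 0], [0, 1]], 300),
    np.random.multivariate_normal([5, 8], [[0.5, 0], [0, 0.5]], 200)))
cls, means, steps = mlpy.kmeans(x, k=3, plus=True)
plt.scatter(x[:, 0], x[:, 1], c=cls, alpha=0.5)           # samples coloured by cluster
plt.scatter(means[:, 0], means[:, 1], marker='d', s=120)  # estimated cluster means
plt.show()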
Example #7
        def RunKMeansMlpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            data = np.genfromtxt(self.dataset, delimiter=',')

            # Gather all parameters.
            clusters = re.search(r'-c (\d+)', options)
            seed = re.search(r"-s (\d+)", options)

            # Now do validation of options.
            if not clusters:
                Log.Fatal(
                    "Required option: Number of clusters or cluster locations."
                )
                q.put(-1)
                return -1
            elif int(clusters.group(1)) < 1:
                Log.Fatal(
                    "Invalid number of clusters requested! Must be greater than or "
                    + "equal to 1.")
                q.put(-1)
                return -1

            try:
                with totalTimer:
                    # Create the K-Means object and perform K-Means clustering.
                    if seed:
                        kmeans = mlpy.kmeans(data,
                                             int(clusters.group(1)),
                                             seed=int(seed.group(1)))
                    else:
                        kmeans = mlpy.kmeans(data, int(clusters.group(1)))
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #8
def clustering(depth, matrix, numCL):
    groupsOfUsers = []
    cls, means, steps = mlpy.kmeans(matrix, k=numCL)
    print cls

    for i in range(numCL):
        groupsOfUsers.append(np.zeros((27, 27)))
    for i in range(len(cls)):
        groupsOfUsers[cls[i]] += depth[i]
    for i in range(numCL):
        np.save("groupOfUsers%02d.npy" % i, groupsOfUsers[i])
        with open("groupOfUsers%02d.csv" % i, 'w') as csvfile:
            writer = csv.writer(csvfile)
            for r in groupsOfUsers[i].tolist():
                writer.writerow(r)
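A hedged invocation sketch for the function above; the shapes are assumptions read off the code (one 27x27 matrix per user in depth, one feature row per user in matrix), and it writes the groupOfUsers*.npy/.csv files as a side effect:

import numpy as np

depth = [np.random.rand(27, 27) for _ in range(10)]  # synthetic per-user matrices
matrix = np.random.rand(10, 4)                       # 10 users, 4 features each
clustering(depth, matrix, 3)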
Example #9
    def RunKMeansMlpy(q):
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      data = np.genfromtxt(self.dataset[0], delimiter=',')

      # Gather all parameters.
      clusters = re.search(r'-c (\d+)', options)
      seed = re.search(r"-s (\d+)", options)

      # Now do validation of options.
      if not clusters:
        Log.Fatal("Required option: Number of clusters or cluster locations.")
        q.put(-1)
        return -1
      elif int(clusters.group(1)) < 1:
        Log.Fatal("Invalid number of clusters requested! Must be greater than or "
            + "equal to 1.")
        q.put(-1)
        return -1

      try:
        with totalTimer:
          # Create the K-Means object and perform K-Means clustering.
          if seed:
            kmeans = mlpy.kmeans(data, int(clusters.group(1)), seed=int(seed.group(1)))
          else:
            kmeans = mlpy.kmeans(data, int(clusters.group(1)))
      except Exception as e:
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example #10
def main():
    train_business_dict = {}
    train_input.get_business(config.train_business, train_business_dict)
    
    train_input.get_checkin(config.train_checkin, train_business_dict)
    
    test_business_dict = {}
    test_input.get_test_business(config.test_business, test_business_dict)
    
    test_input.get_test_checkin(config.test_checkin,test_business_dict)
    
    
    
    kmeans_input = []
    for each in train_business_dict:
        l = train_business_dict[each].get_chink_info_vec()
        kmeans_input.append(l)

    ### use mlpy to cluster the check-in data
    x = np.concatenate(kmeans_input, axis=0)
    cls, means, steps = mlpy.kmeans(x, k=100, plus=True)
    
    means_star = [ [0,0] for i in range(0,100)]
    
    ## figure out which cluster each train_business belongs to
    for each in train_business_dict:
        
        check_vec = train_business_dict[each].checkin_vec
        
        near_cluster = -1
        min_dist = 100000000
        for each_mean in  range(0,100):
            
            dist = distance(check_vec, means[each_mean])
            
            if dist < min_dist:
                near_cluster = each_mean
                min_dist = dist
        print near_cluster
        means_star[near_cluster][0] = means_star[near_cluster][0] + train_business_dict[each].stars
        means_star[near_cluster][1] = means_star[near_cluster][1] + 1
        
    # then compute each business
    
    k_compute_test_review( config.test_review,train_business_dict,test_business_dict,means,means_star)
Example #11
        def RunKMeansMlpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            data = np.genfromtxt(self.dataset[0], delimiter=',')

            # Gather all parameters.
            if "clusters" in options:
                clusters = int(options.pop("clusters"))

                if clusters < 1:
                    Log.Fatal(
                        "Invalid number of clusters requested! Must be greater than or "
                        + "equal to 1.")
                    q.put(-1)
                    return -1
            else:
                Log.Fatal(
                    "Required option: Number of clusters or cluster locations."
                )
                q.put(-1)
                return -1

            build_opts = {}
            if "seed" in options:
                build_opts["seed"] = int(options.pop("seed"))

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            try:
                with totalTimer:
                    # Create the K-Means object and perform K-Means clustering.
                    kmeans = mlpy.kmeans(data, clusters, **build_opts)
            except Exception as e:
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #12
def gep(argv):
    #read gene and ccel info
    nexp_genes, t_exp = read_texp(sys.argv[1], sys.argv[2])
    unip2gene = read_unip(sys.argv[3])
    unip2gene_int = read_unip(sys.argv[3])
    compl2 = {}
    ccel_ann = read_ccel(sys.argv[4])
    mutations = count_mutations(sys.argv[5])
    smutations = count_smutations(sys.argv[6])
    cancer_genes = cancerGenes(sys.argv[7])
    oncogenesBySite = cancerGenes_site(sys.argv[7])
    tsg = []
    compl = read_compl(sys.argv[8], unip2gene, compl2)
    compl2_c1 = {}
    compl_c1 = read_compl_c1(sys.argv[9], unip2gene, compl2_c1)
    interactions = read_int(sys.argv[10], unip2gene_int)
    ppi = read_int(sys.argv[11], unip2gene_int)
    tsg = read_tsg(sys.argv[13])
    essG = essentialGenes(sys.argv[14])
    essG = list(set(essG))
    driverGenes = essentialGenes(sys.argv[15])
    priority = essentialGenes(sys.argv[16])
    tfs = readTF(sys.argv[17])
    top_compl = essentialGenes(sys.argv[18])
    mirnas = read_miRNA2(sys.argv[21])
    miRNABySite = read_miRNA(sys.argv[21])

    #load challenge data
    with open('obj/expr.pkl', 'rb') as handle:
        expr = pickle.load(handle)
    with open('obj/p2g.pkl', 'rb') as handle:
        p2g = pickle.load(handle)
    with open('obj/expr1.pkl', 'rb') as handle:
        expr1 = pickle.load(handle)
    with open('obj/expr2b.pkl', 'rb') as handle:
        expr2b = pickle.load(handle)
    with open('obj/expr2.pkl', 'rb') as handle:
        expr2 = pickle.load(handle)
    with open('obj/cnv.pkl', 'rb') as handle:
        cnv = pickle.load(handle)
    with open('obj/cnv_g.pkl', 'rb') as handle:
        cnv_g = pickle.load(handle)
    with open('obj/cnv2.pkl', 'rb') as handle:
        cnv2 = pickle.load(handle)
    with open('obj/genes.pkl', 'rb') as handle:
        genes = pickle.load(handle)
    with open('obj/headers.pkl', 'rb') as handle:
        headers = pickle.load(handle)
    with open('obj/headers_t.pkl', 'rb') as handle:
        headers_t = pickle.load(handle)
    with open('obj/es.pkl', 'rb') as handle:
        es = pickle.load(handle)
    with open('obj/essent2.pkl', 'rb') as handle:
        essent2 = pickle.load(handle)
    with open('obj/essent3.pkl', 'rb') as handle:
        essent3 = pickle.load(handle)
    with open('obj/geneInEssent.pkl', 'rb') as handle:
        geneInEssent = pickle.load(handle)

    with open('obj/mut.pkl', 'rb') as handle:
        mut = pickle.load(handle)
    with open('obj/mut2.pkl', 'rb') as handle:
        mut2 = pickle.load(handle)
    with open('obj/mut3.pkl', 'rb') as handle:
        mut3 = pickle.load(handle)
    with open('obj/mut_test.pkl', 'rb') as handle:
        mut_test = pickle.load(handle)
    with open('obj/mut2_test.pkl', 'rb') as handle:
        mut2_test = pickle.load(handle)
    with open('obj/mut3_test.pkl', 'rb') as handle:
        mut3_test = pickle.load(handle)

    print mut3.keys()
    print mut3_test.keys()

    #join ccel data
    id_type = []
    id_site = []
    id_site2 = []
    id_hist = []
    id_hists = []

    ccel_type = {}
    ccel_site = {}
    ccel_site2 = {}
    ccel_gender = {}
    ccel_hist = {}
    ccel_hists = {}
    nline = 0

    tot_ccel = []
    with open(sys.argv[12]) as f:
        for line in f.readlines():
            line = line.replace("\n", "")
            line = line.replace("\r", "")

            ccel = line.split('\t')
            tot_ccel.append(ccel[0])
            if not nline == 0:
                ccel_type[ccel[0]] = ccel[2]
                ccel_site[ccel[0]] = ccel[3]
                found = False
                if ccel[0] in ccel_ann:
                    found = True
                    ccel_gender[ccel[0]] = ccel_ann[ccel[0]][0]
                    ccel_site2[ccel[0]] = ccel_ann[ccel[0]][1]
                    ccel_hist[ccel[0]] = ccel_ann[ccel[0]][2]
                    ccel_hists[ccel[0]] = ccel_ann[ccel[0]][3]
                if ccel[1] in ccel_ann and not found:
                    found = True
                    ccel_gender[ccel[0]] = ccel_ann[ccel[1]][0]
                    ccel_site2[ccel[0]] = ccel_ann[ccel[1]][1]
                    ccel_hist[ccel[0]] = ccel_ann[ccel[1]][2]
                    ccel_hists[ccel[0]] = ccel_ann[ccel[1]][3]
                if not found:
                    print "MISSING!" + ccel[0]
                    ccel_gender[ccel[0]] = ''
                    ccel_site2[ccel[0]] = ''
                    ccel_hist[ccel[0]] = ''
                    ccel_hists[ccel[0]] = ''

            nline += 1
    for s in ccel_site:
        if not (ccel_site[s] in id_site):
            id_site.append(ccel_site[s])

    for s in ccel_type:
        if not (ccel_type[s] in id_type):
            id_type.append(ccel_type[s])

    for s in ccel_site2:
        if not (ccel_site2[s] in id_site2):
            id_site2.append(ccel_site2[s])

    for s in ccel_hist:
        if not (ccel_hist[s] in id_hist):
            id_hist.append(ccel_hist[s])

    for s in ccel_hists:
        if not (ccel_hists[s] in id_hists):
            id_hists.append(ccel_hists[s])

    mutatedGenes, mutationCCEL, mutationSITE, mutationSITE2, mutationHIST, mutationHISTS = mutations_ccel(
        sys.argv[5], headers)

    #vectorize ccel data
    c_features = {}
    for c in tot_ccel:
        if c != "Name" and c != "Description":
            c_features[c] = []
            #row.append(smutations[c])
            tissID = 0
            for id1, t in enumerate(id_type):
                if t == str(ccel_type[c]):
                    c_features[c].append(1)
                    #tissID=id1
                else:
                    c_features[c].append(0)

                #row.append(tissID)
                #     siteID=0
                for id1, s in enumerate(id_site):
                    if s == str(ccel_site[c]):
                        c_features[c].append(1)
                        #siteID=id1
                    else:
                        c_features[c].append(0)
                    #row.append(siteID)
                gender = 0
                if ccel_gender[c] == 'M':
                    gender = 1
                if ccel_gender[c] == 'F':
                    gender = 2
                c_features[c].append(gender)
                #siteID=0

                for id1, s in enumerate(id_site2):
                    if s == str(ccel_site2[c]):
                        c_features[c].append(1)
                    else:
                        c_features[c].append(0)
                        #siteID=id1
                    #row.append(siteID)
                #siteID=0
                for id1, s in enumerate(id_hist):
                    if s == str(ccel_hist[c]):
                        c_features[c].append(1)
                    else:
                        c_features[c].append(0)
                        #siteID=id1
                    #row.append(siteID)
                #siteID=0
                for id1, s in enumerate(id_hists):
                    if s == str(ccel_hists[c]):
                        c_features[c].append(1)
                    else:
                        c_features[c].append(0)
                        #siteID=id1

    # put data in the right format for the learner
    intr = 0
    tot_sp1 = 0
    tot_sp2 = 0
    s1_prediction = {}
    full = []
    full_exp = []
    full_cnv = []
    full_ccel = []
    full_s = []
    full2 = []
    full3 = []
    full4 = []
    full3_test = []
    full4_test = []
    full_rank = []
    minmax_fe = []
    minmax_fc = []
    onco_features = []
    ccel_features = []
    mut_features = []
    expression_features = []
    for c in headers:
        if c != "Name" and c != "Description":
            row = []
            row2 = []
            drow = {}
            drow2 = {}
            #row=row+c_features[c]
            for ex in genes:
                #row.append((expr1[c+ex]-exp_avg)/exp_std)
                row.append(expr1[c + ex])
                drow[ex] = expr1[c + ex]
            for ex in cnv_g:
                row2.append(float(cnv[c + ex]))
                drow2[ex] = cnv[c + ex]
            '''	
           sorted_x = sorted(drow.iteritems(), key=operator.itemgetter(1))
           etop=sorted_x[0:10]
           ebot=sorted_x[-10:]
           for e in etop:
		minmax_fe.append(e[0])
           for e in ebot:
		minmax_fe.append(e[0])
           sorted_x = sorted(drow2.iteritems(), key=operator.itemgetter(1))
           etop=sorted_x[0:10]
           ebot=sorted_x[-10:]
           for e in etop:
		minmax_fc.append(e[0])
           for e in ebot:
		minmax_fc.append(e[0])
           '''
            full_exp.append(row)
            full_cnv.append(row2)
            full_ccel.append(c_features[c])
            mutRow = []
            for g in mutatedGenes:
                if c in mutationCCEL:
                    if g in mutationCCEL[c]:
                        mutRow.append(1)
                    else:
                        mutRow.append(0)
                else:
                    mutRow.append(0)

            oncoRow = []
            print c
            for g in cancer_genes:
                if ccel_site[c] in oncogenesBySite:
                    if g in oncogenesBySite[ccel_site[c]]:
                        oncoRow.append(1)
                    else:
                        oncoRow.append(0)
                elif ccel_site2[c] in oncogenesBySite:
                    if g in oncogenesBySite[ccel_site2[c]]:
                        oncoRow.append(1)
                    else:
                        oncoRow.append(0)
                else:
                    oncoRow.append(0)

            nexp = []
            for g in nexp_genes:
                if ccel_site[c] in t_exp:
                    if g in t_exp[ccel_site[c]]:
                        nexp.append(1)
                    else:
                        nexp.append(0)
                elif ccel_site2[c] in t_exp:
                    if g in t_exp[ccel_site2[c]]:
                        nexp.append(1)
                    else:
                        nexp.append(0)
                else:
                    nexp.append(0)

            miRNA = []
            print c
            for g in mirnas:
                if ccel_site[c] in miRNABySite:
                    if g in miRNABySite[ccel_site[c]]:
                        print "entro"
                        miRNA.append(1)
                    else:
                        miRNA.append(0)
                elif ccel_site2[c] in miRNABySite:
                    if g in miRNABySite[ccel_site2[c]]:
                        miRNA.append(1)
                        print "entro2"
                    else:
                        miRNA.append(0)
                else:
                    miRNA.append(0)

            onco_features.append(oncoRow)
            ccel_features.append(c_features[c])
            mut_features.append(mutRow)
            expression_features.append(nexp)
            full3.append(row + row2 + c_features[c] + oncoRow + nexp)
            full4.append(row + row2 + c_features[c] + oncoRow + nexp + mut3[c])
            full.append(row + row2 + c_features[c])
            #print len(row)

    #full_exp=preprocessing.binarize(np.array(full_exp),threshold=np.mean(full_exp)).tolist()
    #full_cnv=preprocessing.binarize(np.array(full_cnv),threshold=np.mean(full_cnv)).tolist()
    full_s = []
    full_s2 = []
    for c in headers:
        if c != "Name" and c != "Description":
            row = []
            for ex in genes:
                row.append(expr1[c + ex])
            full_s.append(row)
    for c in headers_t:
        if c != "Name" and c != "Description":
            row = []
            for ex in genes:
                row.append(expr2[c + ex])
            full_s2.append(row)
    print len(full_s)
    print len(full_s2)
    #full_s=np.concatenate((np.array(full_s),np.array(full_s2)))
    full_s = np.array(full_s)
    print len(full_s[0])
    cls, means, steps = kmeans(full_s, k=5, plus=True)
    clusters = {}
    for c in range(len(cls)):
        clusters[c] = []
    for i, c in enumerate(cls):
        clusters[c].append(i)
    full = means.T
    full = full.tolist()

    for c in headers_t:
        if c != "Name" and c != "Description":
            row2 = []
            row = []
            for ex in genes:
                row.append(expr2[c + ex])
            for ex in cnv_g:
                row2.append(float(cnv2[c + ex]))

            mutRow = []
            for g in mutatedGenes:
                if c in mutationCCEL:
                    if g in mutationCCEL[c]:
                        mutRow.append(1)
                    else:
                        mutRow.append(0)
                else:
                    mutRow.append(0)

            oncoRow = []
            print c
            for g in cancer_genes:
                if ccel_site[c] in oncogenesBySite:
                    if g in oncogenesBySite[ccel_site[c]]:
                        oncoRow.append(1)
                    else:
                        oncoRow.append(0)
                elif ccel_site2[c] in oncogenesBySite:
                    if g in oncogenesBySite[ccel_site2[c]]:
                        oncoRow.append(1)
                    else:
                        oncoRow.append(0)
                else:
                    oncoRow.append(0)

            nexp = []
            for g in nexp_genes:
                if ccel_site[c] in t_exp:
                    if g in t_exp[ccel_site[c]]:
                        nexp.append(1)
                    else:
                        nexp.append(0)
                elif ccel_site2[c] in t_exp:
                    if g in t_exp[ccel_site2[c]]:
                        nexp.append(1)
                    else:
                        nexp.append(0)
                else:
                    nexp.append(0)

            miRNA = []
            print c
            for g in mirnas:
                if ccel_site[c] in miRNABySite:
                    if g in miRNABySite[ccel_site[c]]:
                        print "entro"
                        miRNA.append(1)
                    else:
                        miRNA.append(0)
                elif ccel_site2[c] in miRNABySite:
                    if g in miRNABySite[ccel_site2[c]]:
                        miRNA.append(1)
                        print "entro2"
                    else:
                        miRNA.append(0)
                else:
                    miRNA.append(0)

            full3_test.append(row + row2 + c_features[c] + oncoRow + nexp)
            full4_test.append(row + row2 + c_features[c] + oncoRow + nexp +
                              mut3_test[c])
            row2 = row + row2 + c_features[c]
            full2.append(row2)
    #xt=np.array(full)
    #test=np.array(full2)

    sel = VarianceThreshold(threshold=0)
    full3 = np.array(full3)
    full3_test = np.array(full3_test)
    tmp = np.concatenate((full3, full3_test), axis=0)
    vs = sel.fit(tmp)
    full3 = vs.transform(full3)
    full3 = full3.tolist()
    full3_test = vs.transform(full3_test)
    full3_test = full3_test.tolist()

    full4 = np.array(full4)
    full4_test = np.array(full4_test)
    tmp = np.concatenate((full4, full4_test), axis=0)
    vs = sel.fit(tmp)
    full4 = vs.transform(full4)
    full4 = full4.tolist()
    full4_test = vs.transform(full4_test)
    full4_test = full4_test.tolist()
    inExp = 0
    ninExp = 0

    c_feat2 = []
    for c in headers:
        if c != "Name" and c != "Description":
            row = list(c_features[c])
            c_feat2.append(row)

    #feature selection strategies
    features = {}
    alias = np.array(genes + cnv_g)
    features['oncogenes'] = []
    features['mutations'] = []
    features['tsg'] = []
    features['essential'] = []
    features['control'] = []
    features['driver'] = []
    #features['top_compl']=[]
    features['minmax'] = []
    features['tf'] = []
    features_test = {}
    features_test['oncogenes'] = []
    features_test['mutations'] = []
    features_test['tsg'] = []
    features_test['essential'] = []
    features_test['control'] = []
    features_test['driver'] = []
    '''
    for tf in tfs:
	tmp=[]
    	for i,c in enumerate(headers):
    		if (i-2)>=0:	
        		exp_list=[]
                      	for p in tfs[tf]:
                          	if (str(c)+str(p)) in expr:
                             		exp_list.append(float(expr[str(c)+str(p)]))
                          	if (str(c)+str(p)) in cnv:
                             		exp_list.append(float(cnv[str(c)+str(p)]))
			if len(exp_list)>2:
               			tmp.append(np.array(exp_list))
	if len(tmp)>1:
		features[tf]=tmp
		print str(tf)+": "+str(len(tmp[0]))
    for cc in top_compl:
	tmp=[]
    	for i,c in enumerate(headers):
    		if (i-2)>=0:	
        		exp_list=[]
                      	for p in compl[cc]:
                          	if (str(c)+str(p)) in expr:
                             		exp_list.append(float(expr[str(c)+str(p)]))
                          	if (str(c)+str(p)) in cnv:
                             		exp_list.append(float(cnv[str(c)+str(p)]))
			if len(exp_list)>2:
               			tmp.append(np.array(exp_list))
	if len(tmp)>1:
		features[cc]=tmp
    for cc in compl:
	tmp=[]
    	for i,c in enumerate(headers):
    		if (i-2)>=0:	
        		exp_list=[]
                      	for p in compl[cc]:
                          	if (str(c)+str(p)) in expr:
                             		exp_list.append(float(expr[str(c)+str(p)]))
                          	if (str(c)+str(p)) in cnv:
                             		exp_list.append(float(cnv[str(c)+str(p)]))
			if len(exp_list)>2:
               			tmp.append(np.array(exp_list))
	if len(tmp)>1:
		features[cc]=tmp
    if g in ppi:
	for p in ppi[g]:
		if ((str(c)+str(p)) in expr) and (not (p==g)):
         		exp_list.append(float(expr[str(c)+str(p)]))
    if (str(c)+str(g) in expr):
	exp_list.append(float(expr[str(c)+str(g)]))
    '''
    #print ccel_hist.values()
    #cancer_genes = cancerGenes2(sys.argv[7], ccel_type.values(), ccel_hist.values(), ccel_site.values(), ccel_site2.values())
    #print len(cancer_genes)
    combined_probes = []
    combined_cnvs = []
    oncoprobes = []
    for i, c in enumerate(headers):
        if (i - 2) >= 0:
            exp_list = []
            cancer_list = []
            mutation_list = []
            tsg_list = []
            control_list = []
            ess_list = []
            driver_list = []
            tf_list = []
            '''
		for f in hist2:
			if f in genes:
                      		top_list.append(float(expr1[str(c)+str(f)]))
			if f in cnv_g:
                      		top_list.append(float(cnv[str(c)+str(f)]))
		for cc in top_compl:
                      	for p in compl[int(cc)]:
                          	if (str(c)+str(p)) in expr:
                             		exp_list.append(float(expr[str(c)+str(p)]))
                          	if (str(c)+str(p)) in cnv:
                             		exp_list.append(float(cnv[str(c)+str(p)]))
               	features['top_compl'].append(exp_list)	
		'''
            for ex in genes:

                #if p2g[ex] in cancer_genes:
                #    exp_list.append(float(expr[str(c)+str(p2g[ex])]))
                if p2g[ex] in cancer_genes:
                    cancer_list.append(float(expr[str(c) + str(p2g[ex])]))
                    #combined_probes.append(ex)

                if p2g[ex] in tfs:
                    tf_list.append(float(expr[str(c) + str(p2g[ex])]))
                    #combined_probes.append(ex)

                if p2g[ex] in mutations:
                    mutation_list.append(float(expr[str(c) + str(p2g[ex])]))
                    #combined_probes.append(ex)
                if p2g[ex] in tsg:
                    tsg_list.append(float(expr[str(c) + str(p2g[ex])]))
                    #combined_probes.append(ex)
                if p2g[ex] in essG:
                    ess_list.append(float(expr[str(c) + str(p2g[ex])]))
                    #combined_probes.append(ex)
                if p2g[ex] in driverGenes:
                    driver_list.append(float(expr[str(c) + str(p2g[ex])]))
                    #combined_probes.append(ex)

            for ex in cnv_g:
                if ex in cancer_genes:
                    cancer_list.append(float(cnv[str(c) + str(ex)]))
                    #combined_cnvs.append(ex)

                if ex in tfs:
                    tf_list.append(float(cnv[str(c) + str(ex)]))
                    #combined_cnvs.append(ex)

                if ex in mutations:
                    mutation_list.append(float(cnv[str(c) + str(ex)]))
                    #combined_cnvs.append(ex)
                if ex in tsg:
                    tsg_list.append(float(cnv[str(c) + str(ex)]))
                    #combined_cnvs.append(ex)
                if ex in essG:
                    ess_list.append(float(cnv[str(c) + str(ex)]))
                    #combined_cnvs.append(ex)
                if ex in driverGenes:
                    driver_list.append(float(cnv[str(c) + str(ex)]))
                    #combined_cnvs.append(ex)

            features['oncogenes'].append(cancer_list + c_features[c])
            features['mutations'].append(mutation_list)
            features['tsg'].append(tsg_list)
            features['essential'].append(ess_list)
            features['driver'].append(driver_list)
            features['tf'].append(tf_list)
    '''
    redundants=[]
    print "redundant"
    for i1,f1 in enumerate(features['oncogenes'][0]):
	row=[]
    	print i1
	for i2,f2 in enumerate(features['oncogenes'][0]):
		if i1>i2:
			row.append(pearsonr(np.array(features['oncogenes'])[:,i1],np.array(features['oncogenes'])[:,i2])[0])
		else:
			row.append(0)
	redundants.append(row)
    features['oncogenes2']=[]
    noRed=[]
    print "removing..."
    for i in range(len(redundants),0):
	if (min(redundants[i])<0.8):
		noRed.append(i)
    tmp=np.array()

    features['oncogenes2']=np.array(features['oncogenes'])[:,noRed]
    features['oncogenes2'].tolist()
    print len(features['oncogenes'][0])
    print len(features['oncogenes2'][0])
    #for i,f in enumerate(features['oncogenes'][0]):
	#if i in noRed:
		#np.concatenate(tmp,features['oncogenes
    print len(features['oncogenes'][0])
    cls, means, steps = kmeans(np.array(features['oncogenes']).T, k=3000, plus=True)
    #print len(cls)
    #print len(means)
    #print len(means[0])
    features['oncogenes']=means.T	
    features['oncogenes']=features['oncogenes'].tolist()	
    print len(features['oncogenes'][0])

    #for i,c in enumerate(headers):
    #	if (i-2)>=0:	
	#	features['oncogenes'][i-2]=features['oncogenes'][i-2]+c_features[c]
    '''
    print "features test"
    '''
    combined_probes_test=[]
    combined_cnvs_test=[]
    for i,c in enumerate(headers_t):
    	if (i-2)>=0:	
        	exp_list=[]
        	cancer_list=[]
        	mutation_list=[]
        	tsg_list=[]
        	control_list=[]
        	ess_list=[]
        	driver_list=[]
		for ex in genes:

			#if p2g[ex] in cancer_genes:
			#    exp_list.append(float(expr[str(c)+str(p2g[ex])]))	
			if p2g[ex] in cancer_genes:
                      		cancer_list.append(float(expr2b[str(c)+str(p2g[ex])]))
                      		combined_probes.append(ex)

			if p2g[ex] in mutations:
                      		mutation_list.append(float(expr2b[str(c)+str(p2g[ex])]))
                      		combined_probes_test.append(ex)
			if p2g[ex] in tsg:
                      		tsg_list.append(float(expr2b[str(c)+str(p2g[ex])]))
                      		combined_probes_test.append(ex)
			if p2g[ex] in essG:
                      		ess_list.append(float(expr2b[str(c)+str(p2g[ex])]))
                      		combined_probes_test.append(ex)
			if p2g[ex] in driverGenes:
                      		driver_list.append(float(expr2b[str(c)+str(p2g[ex])]))
                      		combined_probes_test.append(ex)

		for ex in cnv_g:
			if ex in cancer_genes:	
                      		cancer_list.append(float(cnv2[str(c)+str(ex)]))	
                      		combined_cnvs_test.append(ex)

			if ex in mutations:	
                      		mutation_list.append(float(cnv2[str(c)+str(ex)]))
                      		combined_cnvs_test.append(ex)
			if ex in tsg:	
                      		tsg_list.append(float(cnv2[str(c)+str(ex)]))	
                      		combined_cnvs_test.append(ex)
			if ex in essG:	
                      		ess_list.append(float(cnv2[str(c)+str(ex)]))	
                      		combined_cnvs_test.append(ex)
			if ex in driverGenes:	
                      		driver_list.append(float(cnv2[str(c)+str(ex)]))	
                      		combined_cnvs_test.append(ex)	

               	features_test['oncogenes'].append(cancer_list)	
               	features_test['mutations'].append(mutation_list)	
               	features_test['tsg'].append(tsg_list)	
               	features_test['essential'].append(ess_list)
               	features_test['driver'].append(driver_list)
    '''
    print "f test"
    '''
    for i,c in enumerate(headers):
    	if (i-2)>=0:	
        	exp_list=[]
		for ex in genes:

			#if p2g[ex] in cancer_genes:
			#    exp_list.append(float(expr[str(c)+str(p2g[ex])]))	
			if p2g[ex] in mutations:
                      		exp_list.append(float(expr[str(c)+str(p2g[ex])]))
                      		combined_probes.append(ex)
		for ex in cnv_g:
			if ex in mutations:	
                      		exp_list.append(float(cnv[str(c)+str(ex)]))
                      		combined_cnvs.append(ex)	
               	features['mutations'].append(exp_list)

    for i,c in enumerate(headers):
    	if (i-2)>=0:	
        	exp_list=[]
		for ex in genes:

			#if p2g[ex] in cancer_genes:
			#    exp_list.append(float(expr[str(c)+str(p2g[ex])]))	
			if p2g[ex] in tsg:
                      		exp_list.append(float(expr[str(c)+str(p2g[ex])]))
                      		combined_probes.append(ex)
		for ex in cnv_g:
			if ex in tsg:	
                      		exp_list.append(float(cnv[str(c)+str(ex)]))	
                      		combined_cnvs.append(ex)
               	features['tsg'].append(exp_list)

    for i,c in enumerate(headers):
    	if (i-2)>=0:	
        	exp_list=[]
		for ex in genes:

			#if p2g[ex] in cancer_genes:
			#    exp_list.append(float(expr[str(c)+str(p2g[ex])]))	
			if p2g[ex] in essG:
                      		exp_list.append(float(expr[str(c)+str(p2g[ex])]))
                      		combined_probes.append(ex)
		for ex in cnv_g:
			if ex in essG:	
                      		exp_list.append(float(cnv[str(c)+str(ex)]))	
                      		combined_cnvs.append(ex)
               	features['essential'].append(exp_list)
    #combined features
    combined_probes=list(set(combined_probes))
    combined_cnvs=list(set(combined_cnvs))
    features['combined']=[]
    for i,c in enumerate(headers):
    	if (i-2)>=0:	
        	exp_list=[]
		for ex in combined_probes:
                	exp_list.append(float(expr1[str(c)+str(ex)]))
		for ex in combined_cnvs:
                	exp_list.append(float(cnv[str(c)+str(ex)]))
   		features['combined'].append(exp_list)

    combined_probes_test=list(set(combined_probes_test))
    combined_cnvs_test=list(set(combined_cnvs_test))
    features_test['combined']=[]
    for i,c in enumerate(headers_t):
    	if (i-2)>=0:	
        	exp_list=[]
		for ex in combined_probes:
                	exp_list.append(float(expr2[str(c)+str(ex)]))
		for ex in combined_cnvs:
                	exp_list.append(float(cnv2[str(c)+str(ex)]))
   		features_test['combined'].append(exp_list)
  

    '''
    #print gct header
    scores = {}
    for f in features:
        scores[f] = 0
    scoresp = {}
    for f in features:
        scoresp[f] = 0
    print "features ready"
    #header for the final submission
    out_file_f = open("prediction" + str(time.time()) + ".gct", "w")
    out_file_f.write("#1.2\n")
    out_file_f.write(
        str(len(geneInEssent)) + "\t" + str(len(headers_t) - 2) + "\n")
    for c in headers_t:
        if c != "Name":
            out_file_f.write("\t")
        out_file_f.write(str(c))
    out_file_f.write("\n")
    best = 0
    control = 0
    avg = 0

    rndGenes = random.sample(geneInEssent, 1000)
    #for g in rndGenes:
    for g in geneInEssent:
        #for g in priority:

        intr += 1
        #s1_prediction[g]={}
        yt = np.array(essent2[g])
        current_score = {}
        current_scorep = {}

        xt = np.array(full3)
        test = np.array(full3_test)
        selector = SelectPercentile(f_regression, percentile=18).fit(xt, yt)
        xt2 = selector.transform(xt)
        test = selector.transform(test)

        eps2 = 3 * np.std(yt) * math.sqrt(math.log(len(yt)) / len(yt))
        cc = max(abs(np.mean(yt) + np.std(yt)), abs(np.mean(yt) - np.std(yt)))
        knn2 = svm.SVR(C=cc, epsilon=eps2)
        res = knn2.fit(xt2, yt).predict(test)
        out_file_f.write(g + "\t" + g)
        for p in res:
            out_file_f.write("\t" + str(p))
        out_file_f.write("\n")
        out_file_f.flush()

        #best+=max(current_score)

    out_file_f.close()
    return 0
Example #13
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
import mlpy
from sklearn.cluster import KMeans

# Load the data and plot

datos=load_iris()
dat=datos.data
caract_names=datos.feature_names
tar=datos.target

# Compute the clusters

cls, means, steps = mlpy.kmeans(dat, k=3, plus=True)

# steps
# This variable reports how many steps the algorithm took to finish

# Build the corresponding plots

plt.subplot(2,1,1)
fig = plt.figure(1)
fig.suptitle("Ejemplo de k-medias",fontsize=15)
plot1 = plt.scatter(dat[:,0], dat[:,1], c=cls, alpha=0.75)
# Add the means to the plot

plot2 = plt.scatter(means[:,0], means[:,1],c=[1,2,3], s=128, marker='d')
#plt.show()
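The example above imports scikit-learn's KMeans but never calls it; for comparison, a minimal sketch of the equivalent clustering through scikit-learn's standard API:

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

dat = load_iris().data
km = KMeans(n_clusters=3).fit(dat)
labels = km.labels_            # per-sample cluster ids, analogous to cls
centers = km.cluster_centers_  # cluster means, analogous to means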
Example #14
import numpy as np
import mlpy
import json
import sys

NR_CLUSTERS = 5
if len(sys.argv) > 1:
    try:
        NR_CLUSTERS = int(sys.argv[1])
    except:
        pass

emails = np.genfromtxt('email-features.csv', delimiter=',')
cls, means, steps = mlpy.kmeans(emails, k=NR_CLUSTERS, plus=True)

print steps, "steps"
print len(means), "clusters"

emails_file = open("emails_parsed_0.json")
emails = json.load(emails_file)

f = open("classifications.txt", "w")
for i in range(len(emails)):
    f.write( "========== EMAIL ==========\n" )
    f.write("==== CLASS " + str(cls[i]) + "\n")
    f.write("==== subject: " + emails[i]['subject'] + '\n')
    f.write("==== email_text: \n" + (emails[i]['email_text']).encode('utf-8') + '\n')

f.close()
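A small follow-up sketch, assuming cls is the label array produced above: summarize how many emails landed in each cluster before reading classifications.txt.

from collections import Counter

sizes = Counter(int(c) for c in cls)  # cluster id -> number of emails
for cluster_id, count in sorted(sizes.items()):
    print("cluster %d: %d emails" % (cluster_id, count))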
Example #15
import os
import sys
from math import log

from mlpy import kmeans

dir = sys.argv[1]  # directory to scan (an assumption; the snippet starts mid-script)
dinfo = []         # rows of [name, size in KB, mtime, mode]

dirlist = os.listdir(dir)

for file in dirlist:
    
    mypath = dir + '/' + file
    info = os.stat(mypath)
    
    # don't deal with directories yet
    if info.st_size > 0:
        finfo = [file,float(info.st_size)/1024,info.st_mtime,info.st_mode]
        dinfo += [finfo]
    
# cluster by size
infos = [[log(x[1])] for x in dinfo]

cls,means,steps = kmeans(infos,k=10,plus=True)

ctmp = -1
data = zip([x[0] for x in dinfo],cls,infos)
for f,c,s in sorted(data,key=lambda tempkey: tempkey[2]):
    
    if c != ctmp:
        print '###############################'
        
    print c,'\t',round(s[0],1),'\t\t',f
        
    ctmp = c



Example #16
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(
        "data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(
        "data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
                                                                  round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                       53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    # iFeaturesSelect = range(100);                                                                                                    # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1,:])
    # EnergyMean = numpy.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1),
            i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering

        # YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = [];
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    # silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    return nSpeakersFinal
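A hedged usage sketch for the function above; the WAV path is hypothetical, the project modules (audioBasicIO, aT, aF, mlpy) must be importable, and passing numOfSpeakers <= 0 triggers the silhouette-based search over 2 to 9 clusters shown in the loop:

n = speakerDiarization("meeting.wav", numOfSpeakers=0)  # "meeting.wav" is a placeholder
print("estimated number of speakers: %d" % n)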
Example #17
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
	'''
	ARGUMENTS:
		- fileName:		the name of the WAV file to be analyzed
		- numOfSpeakers	the number of speakers (clusters) in the recording (<=0 for unknown)
		- mtSize (opt)	mid-term window size
		- mtStep (opt)	mid-term window step
		- stWin  (opt)	short-term window size
		- LDAdim (opt)	LDA dimension (0 for no LDA)
		- PLOT	 (opt)	0 for not plotting the results, 1 for plotting
	'''
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x);
	Duration = len(x) / Fs

	[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
	[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

	[MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

	MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:,i] - MEAN1)  / STD1
		curF2 = (MidTermFeatures[:,i] - MEAN2)  / STD2
		[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
		[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
		MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
		MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
	
	MidTermFeatures = MidTermFeatures2	# TODO	
	# SELECT FEATURES:
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; 																											# SET 0A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; 																									# SET 0B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 0C
	
	iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]; 																	# SET 1A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 															# SET 1B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 1C
	
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; 			# SET 2A		
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 	# SET 2B
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 2C
	
	#iFeaturesSelect = range(100);																									# SET 3	
	#MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010  
	
	MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]		
	
	(MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T	
	numOfWindows = MidTermFeatures.shape[1]

	# remove outliers:
	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]
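	# a window is treated as an outlier if its total distance to all other windows exceeds
	# 1.2 times the average total distance; outliers are excluded from clustering and
	# re-assigned to their nearest non-outlier window afterwards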
	
	# TODO: Combine energy threshold for outlier removal:
	#EnergyMin = numpy.min(MidTermFeatures[1,:])
	#EnergyMean = numpy.mean(MidTermFeatures[1,:])
	#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
	#iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
	#print iNonOutLiers

	perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows	
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
	
	# LDA dimensionality reduction:
	if LDAdim > 0:
		#[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));		
		# extract mid-term features with minimum step:
		mtWinRatio  = int(round(mtSize  / stWin));
		mtStepRatio = int(round(stWin / stWin));
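		# note: stWin / stWin == 1 by construction, i.e. the mid-term statistics below are
		# recomputed with the minimum possible step of one short-term window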
		mtFeaturesToReduce = []			
		numOfFeatures = len(ShortTermFeatures)
		numOfStatistics = 2;			
		#for i in range(numOfStatistics * numOfFeatures + 1):
		for i in range(numOfStatistics * numOfFeatures):
			mtFeaturesToReduce.append([])

		for i in range(numOfFeatures):		# for each of the short-term features:
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos<N):
				N1 = curPos
				N2 = curPos + mtWinRatio
				if N2 > N:
					N2 = N
				curStFeatures = ShortTermFeatures[i][N1:N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))				
				curPos += mtStepRatio		
		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
				
		mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:,i] - MEAN1)  / STD1
			curF2 = (mtFeaturesToReduce[:,i] - MEAN2)  / STD2
			[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
			[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
			mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
		mtFeaturesToReduce = mtFeaturesToReduce2		
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]		
		#mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
		(mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])	
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
		#DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
		#MDistancesAll = numpy.mean(DistancesAll)
		#iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
		#mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin
		#print LDAstep, LDAstepRatio
		for i in range(Labels.shape[0]):
			Labels[i] = int(i*stWin/LDAstepRatio);		
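		# pseudo-labels: consecutive minimum-step windows are grouped into fixed-length
		# temporal blocks, so the supervised LDA projection separates temporal segments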
		clf = LDA(n_components=LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	if numOfSpeakers<=0:
		sRange = range(2,10)
	else:
		sRange = [numOfSpeakers]
	clsAll = []; silAll = []; centersAll = []
	
	for iSpeakers in sRange:
		cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)		# perform k-means clustering
		
		#YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
		#print distance.squareform(YDist).shape
		#hc = mlpy.HCluster()
		#hc.linkage(YDist)
		#cls = hc.cut(14.5)
		#print cls

		# Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
		clsAll.append(cls)
		centersAll.append(means)
		silA = []; silB = []
		for c in range(iSpeakers):								# for each speaker (i.e. for each extracted cluster)
			clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.020:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]			# get subset of feature vectors
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)				# compute average distance between samples that belong to the cluster (a values)
				silA.append(numpy.mean(Yt)*clusterPerCent)
				silBs = []
				for c2 in range(iSpeakers):						# compute distances from samples of other clusters
					if c2!=c:
						clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
				silBs = numpy.array(silBs)							
				silB.append(min(silBs))							# ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
		silA = numpy.array(silA); 
		silB = numpy.array(silB); 
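		# per-cluster silhouette: (b - a) / max(a, b), where a (silA) is the weighted
		# intra-cluster distance and b (silB) the distance to the nearest other cluster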
		sil = []
		for c in range(iSpeakers):								# for each cluster (speaker)
			sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )		# compute silhouette

		silAll.append(numpy.mean(sil))								# keep the AVERAGE SILHOUETTE

	#silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
	imax = numpy.argmax(silAll)									# position of the maximum silhouette value
	nSpeakersFinal = sRange[imax]									# optimal number of clusters

	# generate the final set of cluster labels
	# (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
	cls = numpy.zeros((numOfWindows,))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i-iNonOutLiers))		
		cls[i] = clsAll[imax][j]
		
	# Post-process method 1: hmm smoothing
	for i in range(1):
		startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
		hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)			# hmm training
		hmm.means_ = means; hmm.covars_ = cov
		cls = hmm.predict(MidTermFeaturesNormOr.T)					
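	# (the HMM is initialized directly from per-cluster statistics, without EM re-training,
	# and predict() performs Viterbi decoding; note this relies on the old sklearn.hmm API,
	# which later moved to the separate hmmlearn package)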
	
	# Post-process method 2: median filtering:
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)
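	# two median-filter passes with decreasing kernel sizes suppress isolated label flips
	# without merging longer speaker segments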

	sil = silAll[imax]										# final silhouette
	classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]


	# load ground-truth if available
	gtFile = fileName.replace('.wav', '.segments')							# path of the annotation (ground-truth) file
	if os.path.isfile(gtFile):									# if ground truth exists
		[segStart, segEnd, segLabels] = readSegmentGT(gtFile)					# read GT data
		flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)			# convert to flags

	if PLOT:
		fig = plt.figure()	
		if numOfSpeakers>0:
			ax1 = fig.add_subplot(111)
		else:
			ax1 = fig.add_subplot(211)
		ax1.set_yticks(numpy.array(range(len(classNames))))
		ax1.axis((0, Duration, -1, len(classNames)))
		ax1.set_yticklabels(classNames)
		ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

	if os.path.isfile(gtFile):
		if PLOT:
			ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
		purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
		print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
		if PLOT:
			plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
	if PLOT:
		plt.xlabel("time (seconds)")
		#print sRange, silAll	
		if numOfSpeakers<=0:
			plt.subplot(212)
			plt.plot(sRange, silAll)
			plt.xlabel("number of clusters");
			plt.ylabel("average clustering's sillouette");
		plt.show()
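
A minimal driver for speakerDiarization(), as a hedged sketch: the file name and the call pattern are placeholder assumptions and not part of the original example.

# hypothetical usage -- "recording.wav" is a placeholder file name
if __name__ == '__main__':
	speakerDiarization("recording.wav", numOfSpeakers=2, PLOT=True)		# known speaker count
	speakerDiarization("recording.wav", numOfSpeakers=0, PLOT=True)		# estimate the count (2..9) via silhouette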
Example #5
0
import numpy as np
import matplotlib.pyplot as plt
import mlpy
np.random.seed(0)
mean1, cov1, n1 = [1, 5], [[1, 1], [1, 2]], 200  # 200 points, mean=(1,5)
x1 = np.random.multivariate_normal(mean1, cov1, n1)
mean2, cov2, n2 = [2.5, 2.5], [[1, 0], [0, 1]], 300  # 300 points, mean=(2.5,2.5)
x2 = np.random.multivariate_normal(mean2, cov2, n2)
mean3, cov3, n3 = [5, 8], [[0.5, 0], [0, 0.5]], 200  # 200 points, mean=(5,8)
x3 = np.random.multivariate_normal(mean3, cov3, n3)
x = np.concatenate((x1, x2, x3), axis=0)  # concatenate the samples
cls, means, steps = mlpy.kmeans(x, k=3, plus=True)
print(steps)  # number of iterations performed by k-means
fig = plt.figure(1)
plot1 = plt.scatter(x[:, 0], x[:, 1], c=cls, alpha=0.75)
plot2 = plt.scatter(means[:, 0], means[:, 1], c=np.unique(cls), s=128, marker='d')  # plot the means
plt.show()
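
Example #4 above selects the number of clusters by maximizing the average silhouette over a range of k values. The following sketch applies the same idea to the toy data of this example (it reuses np, mlpy and x from the snippet above); the function name and the 1e-5 stabilizer are my own choices, not part of the original code, and it assumes every cluster retains at least two points.

from scipy.spatial import distance

def pick_k_by_silhouette(data, k_range=range(2, 10)):
    best = []
    for k in k_range:
        cls, means, steps = mlpy.kmeans(data, k=k, plus=True)   # cluster with k-means++
        sil = []
        for c in np.unique(cls):
            a = distance.pdist(data[cls == c]).mean()           # mean intra-cluster distance
            b = min(distance.cdist(data[cls == c], data[cls == c2]).mean()
                    for c2 in np.unique(cls) if c2 != c)        # distance to nearest other cluster
            sil.append((b - a) / (max(a, b) + 1e-5))            # silhouette of cluster c
        best.append(np.mean(sil))                               # average silhouette for this k
    return list(k_range)[int(np.argmax(best))]

print(pick_k_by_silhouette(x))  # should typically recover k=3 for the data above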