Exemple #1
0
	def xval(self, nfold=5, nrep=10, pairwiseGTA=None, pairwiseViral=None, cluster_type='farthest', d=0.03):
		"""Repeated n-fold cross validation over self.profiles.
			Every repetition shuffles self.profiles in place, deals
			them into nfold interleaved folds and, for each fold,
			trains an SVM on the other folds and classifies the
			held-out profiles. Unless the `mini` flag (defined
			outside this method) is set, progress and the names of
			misclassified profiles are written to stdout.
			Input:
				nfold (int): number of folds per repetition
				nrep (int): number of repetitions of the whole xval
				pairwiseGTA: pairwise data passed to Weight for the
					label0 (GTA) training profiles; when falsy,
					every training profile gets weight 1
				pairwiseViral: pairwise data passed to Weight for
					the remaining (viral) training profiles; only
					used when pairwiseGTA is truthy
				cluster_type (str): forwarded to Weight.cluster
				d (float): forwarded to Weight.cluster
			Returns:
				(score0, score1): number of correctly classified
					label0 and non-label0 test profiles, each
					divided by nrep (i.e. averaged per repetition)
		"""
		# keep track of label classification
		score0 = 0.0
		score1 = 0.0
		gta_as_phage = []
		phage_as_gta = []

		# repeat xval results nrep times
		for rep in range(nrep):
			if not mini:
				# Write first, then flush: the message ends in "\r" (no
				# newline), so it would otherwise sit in the buffer.
				sys.stdout.write("Starting rep: %d\r" % (rep+1))
				sys.stdout.flush()
			# randomly sort profiles (in place)
			random.shuffle(self.profiles)
			# split into folds: fold k takes every nfold-th profile from offset k
			split = [self.profiles[k::nfold] for k in range(nfold)]
			# cross val
			for j in range(nfold):
				# Build train and test sets: fold j is held out, the rest is flattened
				train_fold = np.array([x for sublist in (split[:j]+split[j+1:]) for x in sublist])
				test_fold = split[j]
				trainX = np.array([x.features for x in train_fold])
				testX = np.array([x.features for x in test_fold])
				# label0 is encoded as -1.0, everything else as +1.0
				trainY = np.array([-1.0 if y.label == self.label0 else 1.0 for y in train_fold])
				testY = np.array([-1.0 if y.label == self.label0 else 1.0 for y in test_fold])
				testNames = np.array([x.org_name for x in test_fold])
				# Get training set weights
				# NOTE(review): truthiness test, so an empty pairwiseGTA
				# also disables reweighting; pairwiseViral is not checked.
				if pairwiseGTA:
					# Reweight based on training set
					# GTA
					GTA_weight = Weight([x for x in train_fold if x.label == self.label0], pairwiseGTA)
					GTA_clusters = GTA_weight.cluster(cluster_type, d)
					GTA_weight.weight(GTA_clusters)
					# Virus
					virus_weight = Weight([x for x in train_fold if x.label != self.label0], pairwiseViral)
					virus_clusters = virus_weight.cluster(cluster_type, d)
					virus_weight.weight(virus_clusters)
					# Grab updated weights from the profiles themselves
					weights = np.array([x.weight for x in train_fold])
				else:
					weights = np.array([1 for x in train_fold])
				# evaluate results
				predictor = SVMTrain(self.kernel, self.c).train(trainX, trainY, weights)
				for features, truth, name in zip(testX, testY, testNames):
					# Positive product is correct classification
					if predictor.predict(features) * truth > 0:
						# Update label0 if negative, label1 otherwise
						if truth < 0:
							score0 += 1
						else:
							score1 += 1
					else: # predicted incorrectly
						if truth > 0: #virus as GTA
							phage_as_gta.append(name)
						else: #gta as virus
							gta_as_phage.append(name)

		if not mini:
			print("\nPhages (%d) misclassified over %d reps: %s" % (len(phage_as_gta), nrep, phage_as_gta))
			print("\nGTA (%d) misclassified over %d reps: %s\n" % (len(gta_as_phage), nrep, gta_as_phage))

		return (score0/nrep, score1/nrep)
Exemple #2
0
    def xval(self,
             nfold=5,
             nrep=10,
             pairwiseGTA=None,
             pairwiseViral=None,
             cluster_type='farthest',
             d=0.03):
        """Repeated n-fold cross validation over self.profiles.

        Every repetition shuffles self.profiles in place, deals them into
        nfold interleaved folds and, for each fold, trains an SVM on the
        other folds and classifies the held-out profiles. Unless the
        `mini` flag (defined outside this method) is set, progress and the
        names of misclassified profiles are written to stdout.

        Input:
            nfold (int): number of folds per repetition
            nrep (int): number of repetitions of the whole xval
            pairwiseGTA: pairwise data passed to Weight for the label0
                (GTA) training profiles; when falsy, every training
                profile gets weight 1
            pairwiseViral: pairwise data passed to Weight for the
                remaining (viral) training profiles; only used when
                pairwiseGTA is truthy
            cluster_type (str): forwarded to Weight.cluster
            d (float): forwarded to Weight.cluster
        Returns:
            (score0, score1): number of correctly classified label0 and
                non-label0 test profiles, each divided by nrep (i.e.
                averaged per repetition)
        """
        # keep track of label classification
        score0 = 0.0
        score1 = 0.0
        gta_as_phage = []
        phage_as_gta = []

        # repeat xval results nrep times
        for rep in range(nrep):
            if not mini:
                # Emit the message before flushing; it ends in "\r" rather
                # than a newline, so it would otherwise stay buffered.
                sys.stdout.write("Starting rep: %d\r" % (rep + 1))
                sys.stdout.flush()
            # randomly sort profiles (in place)
            random.shuffle(self.profiles)
            # split into folds: fold k takes every nfold-th profile from k
            split = [self.profiles[k::nfold] for k in range(nfold)]
            # cross val
            for j in range(nfold):
                # Build train and test sets: fold j is held out
                train_fold = np.array([
                    x for sublist in (split[:j] + split[j + 1:])
                    for x in sublist
                ])
                test_fold = split[j]
                trainX = np.array([x.features for x in train_fold])
                testX = np.array([x.features for x in test_fold])
                # label0 is encoded as -1.0, everything else as +1.0
                trainY = np.array([
                    -1.0 if y.label == self.label0 else 1.0 for y in train_fold
                ])
                testY = np.array([
                    -1.0 if y.label == self.label0 else 1.0 for y in test_fold
                ])
                testNames = np.array([x.org_name for x in test_fold])
                # Get training set weights
                # NOTE(review): truthiness test, so an empty pairwiseGTA
                # also disables reweighting; pairwiseViral is not checked.
                if pairwiseGTA:
                    # Reweight based on training set
                    # GTA
                    GTA_weight = Weight(
                        [x for x in train_fold if x.label == self.label0],
                        pairwiseGTA)
                    GTA_clusters = GTA_weight.cluster(cluster_type, d)
                    GTA_weight.weight(GTA_clusters)
                    # Virus
                    virus_weight = Weight(
                        [x for x in train_fold if x.label != self.label0],
                        pairwiseViral)
                    virus_clusters = virus_weight.cluster(cluster_type, d)
                    virus_weight.weight(virus_clusters)
                    # Grab updated weights from the profiles themselves
                    weights = np.array([x.weight for x in train_fold])
                else:
                    weights = np.array([1 for x in train_fold])
                # evaluate results
                predictor = SVMTrain(self.kernel,
                                     self.c).train(trainX, trainY, weights)
                for features, truth, name in zip(testX, testY, testNames):
                    # Positive product is correct classification
                    if predictor.predict(features) * truth > 0:
                        # Update label0 if negative, label1 otherwise
                        if truth < 0:
                            score0 += 1
                        else:
                            score1 += 1
                    else:  # predicted incorrectly
                        if truth > 0:  # virus as GTA
                            phage_as_gta.append(name)
                        else:  # gta as virus
                            gta_as_phage.append(name)

        if not mini:
            print("\nPhages (%d) misclassified over %d reps: %s" %
                  (len(phage_as_gta), nrep, phage_as_gta))
            print("\nGTA (%d) misclassified over %d reps: %s\n" %
                  (len(gta_as_phage), nrep, gta_as_phage))

        return (score0 / nrep, score1 / nrep)
Exemple #3
0
    # Nothing can be built without a feature type: bail out with a message
    # when kmer and pseaac are both unset (None) and physico is falsy.
    # NOTE(review): `== None` works here, but `is None` is the idiomatic test.
    if args.kmer == None and args.pseaac == None and not args.physico:
        print("You must specify at least one feature type (-k, -p, -y).")

    else:
        # Weight if needed
        if args.weight:
            # Get distance threshold
            d = args.dist[0]
            # Get cluster type
            cluster_type = args.cluster_type[0]
            # Weight GTA: args.weight[0] is loaded as the GTA pairwise data
            pairwiseGTA = Weight.load(args.weight[0])
            GTA_weight = Weight(gta_profs, pairwiseGTA)
            GTA_clusters = GTA_weight.cluster(cluster_type, d)
            # Return value is discarded; presumably this updates the
            # profiles in place — TODO confirm against Weight.weight
            GTA_weight.weight(GTA_clusters)
            # Weight Virus: args.weight[1] is loaded as the viral pairwise data
            pairwiseViral = Weight.load(args.weight[1])
            virus_weight = Weight(viral_profs, pairwiseViral)
            virus_clusters = virus_weight.cluster(cluster_type, d)
            virus_weight.weight(virus_clusters)

        # Create SVM
        c = args.c[0]
        # args.kernel carries two values: the first is passed on as the
        # kernel, the second is converted to float and passed as kernel_var
        kernel = args.kernel[0]
        kernel_var = float(args.kernel[1])

        svm = SVM(gta_profs, viral_profs, c, kernel, kernel_var)

        # Print support vectors
        if args.svs:
Exemple #4
0
	# Nothing can be built without a feature type: bail out with a message
	# when kmer and pseaac are both unset (None) and physico is falsy.
	# NOTE(review): `== None` works here, but `is None` is the idiomatic test.
	if args.kmer == None and args.pseaac == None and not args.physico:
		print("You must specify at least one feature type (-k, -p, -y).")

	else:
		# Weight if needed
		if args.weight:
			# Get distance threshold
			d = args.dist[0]
			# Get cluster type
			cluster_type = args.cluster_type[0]
			# Weight GTA: args.weight[0] is loaded as the GTA pairwise data
			pairwiseGTA = Weight.load(args.weight[0])
			GTA_weight = Weight(gta_profs, pairwiseGTA)
			GTA_clusters = GTA_weight.cluster(cluster_type, d)
			# Return value is discarded; presumably this updates the
			# profiles in place — TODO confirm against Weight.weight
			GTA_weight.weight(GTA_clusters)
			# Weight Virus: args.weight[1] is loaded as the viral pairwise data
			pairwiseViral = Weight.load(args.weight[1])
			virus_weight = Weight(viral_profs, pairwiseViral)
			virus_clusters = virus_weight.cluster(cluster_type, d)
			virus_weight.weight(virus_clusters)

		# Create SVM
		c = args.c[0]
		# args.kernel carries two values: the first is passed on as the
		# kernel, the second is converted to float and passed as kernel_var
		kernel = args.kernel[0]
		kernel_var = float(args.kernel[1])

		svm = SVM(gta_profs, viral_profs, c, kernel, kernel_var)

		# Print support vectors
		if args.svs: