def xval(self, nfold=5, nrep=10, pairwiseGTA=None, pairwiseViral=None, cluster_type='farthest', d=0.03):
    """Run repeated n-fold cross validation over self.profiles.

    Every repetition shuffles self.profiles in place, deals the profiles
    into nfold interleaved folds and, for each fold, trains a predictor
    (SVMTrain) on the remaining folds and classifies the held-out fold.

    Input:
        nfold (int): number of folds per repetition
        nrep (int): number of repetitions; the totals are divided by it,
            so nrep=0 raises ZeroDivisionError
        pairwiseGTA: handed to Weight together with the label0 training
            profiles; when falsy, every training profile gets weight 1
        pairwiseViral: handed to Weight together with the non-label0
            training profiles (only used when pairwiseGTA is truthy)
        cluster_type: forwarded to Weight.cluster
        d: forwarded to Weight.cluster
    Returns:
        (score0, score1): count of correctly classified label0 and
            non-label0 test profiles, summed over all folds and
            repetitions and divided by nrep
    """
    # Running totals of correct classifications per class
    score0 = 0.0
    score1 = 0.0
    # Names of misclassified test profiles, accumulated over every rep
    gta_as_phage = []
    phage_as_gta = []

    # repeat xval results nrep times
    # NOTE: `mini` is not defined in this method; it is read as a
    # module-level flag that suppresses console output.
    for rep in range(nrep):
        if not mini:
            # Write first, then flush, so the progress line is shown now
            # instead of at the start of the next repetition.
            sys.stdout.write("Starting rep: %d\r" % (rep + 1))
            sys.stdout.flush()

        # randomly sort profiles (in place)
        random.shuffle(self.profiles)
        # split into interleaved folds
        split = [self.profiles[k::nfold] for k in range(nfold)]

        # cross val
        for j in range(nfold):
            # Train on every fold except j, test on fold j. A plain list is
            # enough here: the profiles are only iterated, never indexed
            # as an array.
            train_fold = [x for sublist in (split[:j] + split[j + 1:]) for x in sublist]
            test_fold = split[j]

            trainX = np.array([x.features for x in train_fold])
            testX = np.array([x.features for x in test_fold])
            # label0 maps to -1.0, everything else to +1.0
            trainY = np.array([-1.0 if y.label == self.label0 else 1.0 for y in train_fold])
            testY = np.array([-1.0 if y.label == self.label0 else 1.0 for y in test_fold])
            testNames = np.array([x.org_name for x in test_fold])

            # Get training set weights
            if pairwiseGTA:
                # Reweight based on the training set only.
                # NOTE(review): Weight.weight presumably updates each
                # profile's .weight in place, since the values are read
                # back from the profiles below - confirm.
                # GTA
                GTA_weight = Weight([x for x in train_fold if x.label == self.label0], pairwiseGTA)
                GTA_clusters = GTA_weight.cluster(cluster_type, d)
                GTA_weight.weight(GTA_clusters)
                # Virus
                virus_weight = Weight([x for x in train_fold if x.label != self.label0], pairwiseViral)
                virus_clusters = virus_weight.cluster(cluster_type, d)
                virus_weight.weight(virus_clusters)
                # Grab updated weights
                weights = np.array([x.weight for x in train_fold])
            else:
                weights = np.array([1] * len(train_fold))

            # evaluate results
            predictor = SVMTrain(self.kernel, self.c).train(trainX, trainY, weights)
            for features, truth, name in zip(testX, testY, testNames):
                # Positive product means prediction and truth share a sign
                if predictor.predict(features) * truth > 0:
                    # Update label0 if negative, label1 otherwise
                    if truth < 0:
                        score0 += 1
                    else:
                        score1 += 1
                elif truth > 0:
                    # virus as GTA
                    phage_as_gta.append(name)
                else:
                    # gta as virus
                    gta_as_phage.append(name)

    if not mini:
        print("\nPhages (%d) misclassified over %d reps: %s" % (len(phage_as_gta), nrep, phage_as_gta))
        print("\nGTA (%d) misclassified over %d reps: %s\n" % (len(gta_as_phage), nrep, gta_as_phage))

    return (score0 / nrep, score1 / nrep)
def xval(self, nfold=5, nrep=10, pairwiseGTA=None, pairwiseViral=None, cluster_type='farthest', d=0.03):
    """Run repeated n-fold cross validation over self.profiles.

    Every repetition shuffles self.profiles in place, deals the profiles
    into nfold interleaved folds and, for each fold, trains a predictor
    (SVMTrain) on the remaining folds and classifies the held-out fold.

    Input:
        nfold (int): number of folds per repetition
        nrep (int): number of repetitions; the totals are divided by it,
            so nrep=0 raises ZeroDivisionError
        pairwiseGTA: handed to Weight together with the label0 training
            profiles; when falsy, every training profile gets weight 1
        pairwiseViral: handed to Weight together with the non-label0
            training profiles (only used when pairwiseGTA is truthy)
        cluster_type: forwarded to Weight.cluster
        d: forwarded to Weight.cluster
    Returns:
        (score0, score1): count of correctly classified label0 and
            non-label0 test profiles, summed over all folds and
            repetitions and divided by nrep
    """
    # Running totals of correct classifications per class
    score0 = 0.0
    score1 = 0.0
    # Names of misclassified test profiles, accumulated over every rep
    gta_as_phage = []
    phage_as_gta = []

    # repeat xval results nrep times
    # NOTE: `mini` is not defined in this method; it is read as a
    # module-level flag that suppresses console output.
    for rep in range(nrep):
        if not mini:
            # Write first, then flush, so the progress line is shown now
            # instead of at the start of the next repetition.
            sys.stdout.write("Starting rep: %d\r" % (rep + 1))
            sys.stdout.flush()

        # randomly sort profiles (in place)
        random.shuffle(self.profiles)
        # split into interleaved folds
        split = [self.profiles[k::nfold] for k in range(nfold)]

        # cross val
        for j in range(nfold):
            # Train on every fold except j, test on fold j. A plain list is
            # enough here: the profiles are only iterated, never indexed
            # as an array.
            train_fold = [
                x for sublist in (split[:j] + split[j + 1:]) for x in sublist
            ]
            test_fold = split[j]

            trainX = np.array([x.features for x in train_fold])
            testX = np.array([x.features for x in test_fold])
            # label0 maps to -1.0, everything else to +1.0
            trainY = np.array([
                -1.0 if y.label == self.label0 else 1.0 for y in train_fold
            ])
            testY = np.array([
                -1.0 if y.label == self.label0 else 1.0 for y in test_fold
            ])
            testNames = np.array([x.org_name for x in test_fold])

            # Get training set weights
            if pairwiseGTA:
                # Reweight based on the training set only.
                # NOTE(review): Weight.weight presumably updates each
                # profile's .weight in place, since the values are read
                # back from the profiles below - confirm.
                # GTA
                GTA_weight = Weight(
                    [x for x in train_fold if x.label == self.label0],
                    pairwiseGTA)
                GTA_clusters = GTA_weight.cluster(cluster_type, d)
                GTA_weight.weight(GTA_clusters)
                # Virus
                virus_weight = Weight(
                    [x for x in train_fold if x.label != self.label0],
                    pairwiseViral)
                virus_clusters = virus_weight.cluster(cluster_type, d)
                virus_weight.weight(virus_clusters)
                # Grab updated weights
                weights = np.array([x.weight for x in train_fold])
            else:
                weights = np.array([1] * len(train_fold))

            # evaluate results
            predictor = SVMTrain(self.kernel, self.c).train(trainX, trainY, weights)
            for features, truth, name in zip(testX, testY, testNames):
                # Positive product means prediction and truth share a sign
                if predictor.predict(features) * truth > 0:
                    # Update label0 if negative, label1 otherwise
                    if truth < 0:
                        score0 += 1
                    else:
                        score1 += 1
                elif truth > 0:
                    # virus as GTA
                    phage_as_gta.append(name)
                else:
                    # gta as virus
                    gta_as_phage.append(name)

    if not mini:
        print("\nPhages (%d) misclassified over %d reps: %s" %
              (len(phage_as_gta), nrep, phage_as_gta))
        print("\nGTA (%d) misclassified over %d reps: %s\n" %
              (len(gta_as_phage), nrep, gta_as_phage))

    return (score0 / nrep, score1 / nrep)
if args.kmer == None and args.pseaac == None and not args.physico: print("You must specify at least one feature type (-k, -p, -y).") else: # Weight if needed if args.weight: # Get distance threshold d = args.dist[0] # Get cluster type cluster_type = args.cluster_type[0] # Weight GTA pairwiseGTA = Weight.load(args.weight[0]) GTA_weight = Weight(gta_profs, pairwiseGTA) GTA_clusters = GTA_weight.cluster(cluster_type, d) GTA_weight.weight(GTA_clusters) # Weight Virus pairwiseViral = Weight.load(args.weight[1]) virus_weight = Weight(viral_profs, pairwiseViral) virus_clusters = virus_weight.cluster(cluster_type, d) virus_weight.weight(virus_clusters) # Create SVM c = args.c[0] kernel = args.kernel[0] kernel_var = float(args.kernel[1]) svm = SVM(gta_profs, viral_profs, c, kernel, kernel_var) # Print support vectors if args.svs: